diff --git a/CODEOWNERS b/CODEOWNERS index ec7993c060aa658a6dc70a0badf383a4830a70ca..5fc20409c276c87843fad370fe7ba726ef762970 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1,56 @@ -* @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @arashashari @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @niumanar +# This file is used to subscribe for notifications for PRs +# related to specific file paths, does not necessarily mean +# approval is required from these people before merging. +# +# Learn more about CODEOWNERS syntax here: +# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners + + +# top-level repo folders +/.github/ @jeffra @mrwyattii +/azure/ @jeffra @awan-10 +/benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith +/bin/ @jeffra +/csrc/ @RezaYazdaniAminabadi @awan-10 @jeffra @cmikeh2 @arashb +/deepspeed/ @jeffra +/docker/ @jeffra @awan-10 +/docs/ @jeffra @mrwyattii +/examples/ @jeffra @awan-10 @mrwyattii +/op_builder/ @jeffra @RezaYazdaniAminabadi @cmikeh2 +/release/ @jeffra @mrwyattii +/requirements/ @jeffra @mrwyattii +/scripts/ @jeffra @awan-10 +/tests/ @jeffra @mrwyattii @tjruwase + +# deepspeed +/deepspeed/autotuning/ @cli99 +/deepspeed/checkpoint/ @tjruwase +/deepspeed/comm/ @awan-10 +/deepspeed/compression/ @yaozhewei @minjiaz @xiaoxiawu-microsoft @conglongli +/deepspeed/elasticity/ @jeffra @awan-10 +/deepspeed/launcher/ @jeffra @awan-10 +/deepspeed/module_inject/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb +/deepspeed/moe/ @awan-10 +/deepspeed/monitor/ @awan-10 @jeffra +/deepspeed/nebula/ @tjruwase @jeffra +/deepspeed/ops/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb +/deepspeed/pipe/ @ShadenSmith @duli2012 +/deepspeed/profiling/ @cli99 +/deepspeed/utils/ @jeffra @tjruwase @awan-10 + +# inference +/deepspeed/inference/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb 
+/deepspeed/model_implementations/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb + +# training +/deepspeed/runtime/ @jeffra @tjruwase +/deepspeed/runtime/activation_checkpointing/ @jeffra @tjruwase +/deepspeed/runtime/checkpoint_engine/ @tjruwase @jeffra +/deepspeed/runtime/comm/ @awan-10 +/deepspeed/runtime/compression/ @awan-10 @conglongli +/deepspeed/runtime/data_pipeline/ @conglongli +/deepspeed/runtime/fp16/ @jeffra @tjruwase +/deepspeed/runtime/fp16/onebit/ @conglongli @awan-10 +/deepspeed/runtime/pipe/ @ShadenSmith @duli2012 +/deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii +/deepspeed/runtime/zero/ @jeffra @tjruwase @samyam @mrwyattii diff --git a/MANIFEST.in b/MANIFEST.in index a918b9286d530f57c67f5d3f80b9245bab541a3e..2fec750c66446173a3ff260f7e6c48eb301faabf 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,3 +2,6 @@ include *.txt README.md recursive-include requirements *.txt recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc +recursive-include op_builder *.py +recursive-include benchmarks *.py +recursive-include accelerator *.py diff --git a/MANIFEST_win.in b/MANIFEST_win.in index ddfe73e0b4185fc2814d758d15d576113297b684..f0426fb00f189d28d71a6ffda98021960330bcfa 100644 --- a/MANIFEST_win.in +++ b/MANIFEST_win.in @@ -6,3 +6,4 @@ recursive-include deepspeed *.tr recursive-exclude deepspeed/ops/csrc *.cpp *.h *.cu *.cuh *.cc prune csrc prune op_builder +prune accelerator diff --git a/README.md b/README.md old mode 100644 new mode 100755 index aafbbe5e79b470b12edc7d97e8c1c85ca7caf050..bfa03a6e8c9a8d2ead26e306d56d57cf42c67647 --- a/README.md +++ b/README.md @@ -1,75 +1,114 @@ -[![Build Status](https://github.com/microsoft/deepspeed/workflows/Build/badge.svg)](https://github.com/microsoft/DeepSpeed/actions) +[![License MIT](https://badgen.net/badge/license/MIT/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) [![PyPI 
version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/) -[![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest) -[![License MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) +[![Downloads](https://pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed) +[![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status) +
- ## Latest News -* [2022/03/21] [Supporting efficient large model training on AMD Instinct GPUs with DeepSpeed](https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/) -* [2022/03/07] [Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam](https://www.deepspeed.ai/tutorials/zero-one-adam/) -* [2022/01/19] [DeepSpeed: Advancing MoE inference and training to power next-generation AI scale](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) - * [Mixture of Experts (MoE) for NLG tutorial](https://www.deepspeed.ai/tutorials/mixture-of-experts-nlg/). - * [Mixture of Experts (MoE) Inference tutorial](https://www.deepspeed.ai/tutorials/moe-inference-tutorial). -* [2021/11/15] [Autotuning: Automatically discover the optimal DeepSpeed configuration that delivers good training speed](https://www.deepspeed.ai/news/2021/11/15/autotuning.html) -* [2021/10/11] [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, the World’s Largest and Most Powerful Generative Language Model](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) - * Read more on how to [train large models with DeepSpeed](https://www.deepspeed.ai/tutorials/large-models-w-deepspeed/) - -### DeepSpeed is hiring, [come join us!](https://careers.microsoft.com/us/en/search-results?keywords=http:%2F%2Fdeepspeed.ai) + DeepSpeed trained the world's most powerful language models ([MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)); [learn how](https://www.deepspeed.ai/tutorials/large-models-w-deepspeed/). 
+ +* [2023/02] [Automatic Tensor Parallelism: Enables tensor parallelism by default without providing an injection policy](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) +* [2022/12] [DeepSpeed Data Efficiency: A composable library that makes better use of data, increases training efficiency, and improves model quality](https://www.deepspeed.ai/2022/12/11/data-efficiency.html) +* [2022/11] [Stable Diffusion Image Generation under 1 second w. DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/benchmark/txt2img) +* [2022/10] [DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference](https://www.deepspeed.ai/2022/10/10/mii.html) +* [2022/09] [ZeRO-Inference: Democratizing massive model inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html) +* [2022/07] [Azure and DeepSpeed empower easy-to-use and high-performance model training](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/) + +--- + +# Extreme Speed and Scale for DL Training and Inference + +[DeepSpeed](https://www.deepspeed.ai/) is an easy-to-use deep learning optimization software suite that enables unprecedented scale and speed for Deep Learning Training and Inference. With DeepSpeed you can: + +* Train/Inference dense or sparse models with billions or trillions of parameters +* Achieve excellent system throughput and efficiently scale to thousands of GPUs +* Train/Inference on resource constrained GPU systems +* Achieve unprecedented low latency and high throughput for inference +* Achieve extreme compression for an unparalleled inference latency and model size reduction with low costs + --- -[DeepSpeed](https://www.deepspeed.ai/) is a deep learning optimization -library that makes distributed training easy, efficient, and effective. 
+# DeepSpeed's three innovation pillars + + + + +## DeepSpeed-Training + +DeepSpeed offers a confluence of system innovations, that has made large scale DL training effective, and efficient, greatly improved ease of use, and redefined the DL training landscape in terms of scale that is possible. These innovations such as ZeRO, 3D-Parallelism, DeepSpeed-MoE, ZeRO-Infinity, etc. fall under the training pillar. Learn more: [DeepSpeed-Training](https://www.deepspeed.ai/training/) + +## DeepSpeed-Inference + +DeepSpeed brings together innovations in parallelism technology such as tensor, pipeline, expert and ZeRO-parallelism, and combines them with high performance custom inference kernels, communication optimizations and heterogeneous memory technologies to enable inference at an unprecedented scale, while achieving unparalleled latency, throughput and cost reduction. This systematic composition of system technologies for inference falls under the inference pillar. Learn more: [DeepSpeed-Inference](https://www.deepspeed.ai/inference) + + +## DeepSpeed-Compression + +To further increase the inference efficiency, DeepSpeed offers easy-to-use and flexible-to-compose compression techniques for researchers and practitioners to compress their models while delivering faster speed, smaller model size, and significantly reduced compression cost. Moreover, SoTA innovations on compression like ZeroQuant and XTC are included under the compression pillar. Learn more: [DeepSpeed-Compression](https://www.deepspeed.ai/compression) + +--- -

10x Larger Models

-

10x Faster Training

-

Minimal Code Change

+# DeepSpeed Software Suite -DeepSpeed delivers extreme-scale model training for everyone, from data scientists training on massive supercomputers to those training on low-end clusters or even on a single GPU: -* Extreme scale: Using current generation of GPU clusters with hundreds of devices, 3D parallelism of DeepSpeed can efficiently train deep learning models with trillions of parameters. -* Extremely memory efficient: With just a single GPU, ZeRO-Offload of DeepSpeed can train models with over 10B parameters, 10x bigger than the state of arts, democratizing multi-billion-parameter model training such that many deep learning scientists can explore bigger and better models. -* Extremely long sequence length: Sparse attention of DeepSpeed powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution comparing with dense transformers. -* Extremely communication efficient: 3D parallelism improves communication efficiency allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam, 0/1 Adam and 1-bit LAMB reduce communication volume by up to 26x while achieving similar convergence efficiency to Adam/LAMB, allowing for scaling to different types of GPU clusters and networks. +## DeepSpeed Library -Early adopters of DeepSpeed have already produced -a language model (LM) with over 17B parameters called -[Turing-NLG](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft), -establishing a new SOTA in the LM category. + The [DeepSpeed](https://github.com/microsoft/deepspeed) library (this repository) implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for easy composition of multitude of features within a single training, inference or compression pipeline. 
The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)). + +## Model Implementations for Inference (MII) + + [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions. + +## DeepSpeed on Azure + + DeepSpeed users are diverse and have access to different environments. We recommend to try DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). + +--- + +# DeepSpeed Adoption DeepSpeed is an important part of Microsoft’s new [AI at Scale](https://www.microsoft.com/en-us/research/project/ai-at-scale/) initiative to enable next-generation AI capabilities at scale, where you can find more information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). 
-**_For further documentation, tutorials, and technical deep-dives please see [deepspeed.ai](https://www.deepspeed.ai/)!_** - -# Table of Contents -| Section | Description | -| --------------------------------------- | ------------------------------------------- | -| [Why DeepSpeed?](#why-deepspeed) | DeepSpeed overview | -| [Install](#installation) | Installation details | -| [Features](#features) | Feature list and overview | -| [Further Reading](#further-reading) | Documentation, tutorials, etc. | -| [Contributing](#contributing) | Instructions for contributing | -| [Publications](#publications) | Publications related to DeepSpeed | -| [Videos](#videos) | Videos related to DeepSpeed | - -# Why DeepSpeed? -Training advanced deep learning models is challenging. Beyond model design, -model scientists also need to set up the state-of-the-art training techniques -such as distributed training, mixed precision, gradient accumulation, and -checkpointing. Yet still, scientists may not achieve the desired system -performance and convergence rate. Large model sizes are even more challenging: -a large model easily runs out of memory with pure data parallelism and it is -difficult to use model parallelism. DeepSpeed addresses these challenges to -accelerate model development *and* training. 
+DeepSpeed has been used to train many different large-scale models, below is a list of several examples that we are aware of (if you'd like to include your model please submit a PR): + + * [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) + * [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf) + * [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed) + * [GLM (130B)](https://github.com/THUDM/GLM-130B) + * [YaLM (100B)](https://github.com/yandex/YaLM-100B) + * [GPT-NeoX (20B)](https://github.com/EleutherAI/gpt-neox) + * [AlexaTM (20B)](https://www.amazon.science/blog/20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning) + * [Turing NLG (17B)](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/) + * [METRO-LM (5.4B)](https://arxiv.org/pdf/2204.06644.pdf) + +DeepSpeed has been integrated with several different popular open-source DL frameworks such as: + +| | Documentation | +| ---------------------------------------------------------------------------------------------- | -------------------------------------------- | + | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) | +| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) | +| | [Lightning with DeepSpeed](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html) | +| | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) | +| | [Determined with DeepSpeed](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) | + +--- + +# Build Pipeline Status + +| Description | 
Status | +| ----------- | ------ | +| NVIDIA | [![nv-torch12-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml) [![nv-torch18-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) | +| AMD | [![amd](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml) | +| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | +| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) | +| Misc | 
[![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)| # Installation @@ -81,8 +120,16 @@ just-in-time (JIT) using [torch's JIT C++ extension loader that relies on ninja](https://pytorch.org/docs/stable/cpp_extension.html) to build and dynamically link them at runtime. -**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ installing -DeepSpeed. +## Requirements +* [PyTorch](https://pytorch.org/) must be installed _before_ installing DeepSpeed. +* For full feature support we recommend a version of PyTorch that is >= 1.8 and ideally the latest PyTorch stable release. +* A CUDA or ROCm compiler such as [nvcc](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#introduction) or [hipcc](https://github.com/ROCm-Developer-Tools/HIPCC) used to compile C++/CUDA/HIP extensions. +* Specific GPUs we develop and test against are listed below, this doesn't mean your GPU will not work if it doesn't fall into this category it's just DeepSpeed is most well tested on the following: + * NVIDIA: Pascal, Volta, Ampere, and Hopper architectures + * AMD: MI100 and MI200 + +## PyPI +We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. ```bash pip install deepspeed @@ -99,83 +146,29 @@ If you would like to pre-install any of the DeepSpeed extensions/ops (instead of JIT compiling) or install pre-compiled ops via PyPI please see our [advanced installation instructions](https://www.deepspeed.ai/tutorials/advanced-install/). 
-On Windows you can build wheel with following steps, currently only inference mode is supported. +## Windows +Windows support is partially supported with DeepSpeed. On Windows you can build wheel with following steps, currently only inference mode is supported. 1. Install pytorch, such as pytorch 1.8 + cuda 11.1 2. Install visual cpp build tools, such as VS2019 C++ x64/x86 build tools 3. Launch cmd console with Administrator privilege for creating required symlink folders 4. Run `python setup.py bdist_wheel` to build wheel in `dist` folder # Features -Below we provide a brief feature list, see our detailed [feature -overview](https://www.deepspeed.ai/features/) for descriptions and usage. - -* [Distributed Training with Mixed Precision](https://www.deepspeed.ai/features/#distributed-training-with-mixed-precision) - * 16-bit mixed precision - * Single-GPU/Multi-GPU/Multi-Node -* [Model Parallelism](https://www.deepspeed.ai/features/#model-parallelism) - * Support for Custom Model Parallelism - * Integration with Megatron-LM -* [Pipeline Parallelism](https://www.deepspeed.ai/tutorials/pipeline/) - * 3D Parallelism -* [The Zero Redundancy Optimizer (ZeRO)](https://www.deepspeed.ai/tutorials/zero/) - * Optimizer State and Gradient Partitioning - * Activation Partitioning - * Constant Buffer Optimization - * Contiguous Memory Optimization -* [ZeRO-Offload](https://www.deepspeed.ai/tutorials/zero-offload/) - * Leverage both CPU/GPU memory for model training - * Support 10B model training on a single GPU -* [Ultra-fast dense transformer kernels](https://www.deepspeed.ai/2020/05/18/bert-record.html) -* [Sparse attention](https://www.deepspeed.ai/2020/09/08/sparse-attention-news.html) - * Memory- and compute-efficient sparse kernels - * Support 10x longer sequences than dense - * Flexible support to different sparse structures -* [1-bit Adam](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html), [0/1 Adam](https://www.deepspeed.ai/tutorials/zero-one-adam/) and 
[1-bit LAMB](https://www.deepspeed.ai/tutorials/onebit-lamb/) - * Custom communication collective - * Up to 26x communication volume saving -* [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations) - * Smart Gradient Accumulation - * Communication/Computation Overlap -* [Training Features](https://www.deepspeed.ai/features/#training-features) - * Simplified training API - * Gradient Clipping - * Automatic loss scaling with mixed precision -* [Training Optimizers](https://www.deepspeed.ai/features/#training-optimizers) - * Fused Adam optimizer and arbitrary `torch.optim.Optimizer` - * Memory bandwidth optimized FP16 Optimizer - * Large Batch Training with LAMB Optimizer - * Memory efficient Training with ZeRO Optimizer - * CPU-Adam -* [Training Agnostic Checkpointing](https://www.deepspeed.ai/features/#training-agnostic-checkpointing) -* [Advanced Parameter Search](https://www.deepspeed.ai/features/#advanced-parameter-search) - * Learning Rate Range Test - * 1Cycle Learning Rate Schedule -* [Simplified Data Loader](https://www.deepspeed.ai/features/#simplified-data-loader) -* [Curriculum Learning](https://www.deepspeed.ai/tutorials/curriculum-learning/) - * A curriculum learning-based data pipeline that presents easier or simpler examples earlier during training - * Stable and 3.3x faster GPT-2 pre-training with 8x/4x larger batch size/learning rate while maintaining token-wise convergence speed - * Complementary to many other DeepSpeed features -* [Performance Analysis and Debugging](https://www.deepspeed.ai/features/#performance-analysis-and-debugging) -* [Mixture of Experts (MoE)](https://www.deepspeed.ai/tutorials/mixture-of-experts/) +Please checkout [DeepSpeed-Training](https://www.deepspeed.ai/training), [DeepSpeed-Inference](https://www.deepspeed.ai/inference) and [DeepSpeed-Compression](https://www.deepspeed.ai/compression) pages for full set of features offered along each of these 
three pillars. # Further Reading -All DeepSpeed documentation can be found on our website: [deepspeed.ai](https://www.deepspeed.ai/) +All DeepSpeed documentation, tutorials, and blogs can be found on our website: [deepspeed.ai](https://www.deepspeed.ai/) -| Article | Description | +| | Description | | ---------------------------------------------------------------------------------------------- | -------------------------------------------- | -| [DeepSpeed Features](https://www.deepspeed.ai/features/) | DeepSpeed features | | [Getting Started](https://www.deepspeed.ai/getting-started/) | First steps with DeepSpeed | | [DeepSpeed JSON Configuration](https://www.deepspeed.ai/docs/config-json/) | Configuring DeepSpeed | | [API Documentation](https://deepspeed.readthedocs.io/en/latest/) | Generated DeepSpeed API documentation | -| [CIFAR-10 Tutorial](https://www.deepspeed.ai/tutorials/cifar-10) | Getting started with CIFAR-10 and DeepSpeed | -| [Megatron-LM Tutorial](https://www.deepspeed.ai/tutorials/megatron/) | Train GPT2 with DeepSpeed and Megatron-LM | -| [BERT Pre-training Tutorial](https://www.deepspeed.ai/tutorials/bert-pretraining/) | Pre-train BERT with DeepSpeed | -| [Learning Rate Range Test Tutorial](https://www.deepspeed.ai/tutorials/lrrt/) | Faster training with large learning rates | -| [1Cycle Tutorial](https://www.deepspeed.ai/tutorials/one-cycle/) | SOTA learning schedule in DeepSpeed | - +| [Tutorials](https://www.deepspeed.ai/tutorials/) | Tutorials | +| [Blogs](https://www.deepspeed.ai/posts/) | Blogs | # Contributing @@ -204,13 +197,20 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information 1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. 
[arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727). 2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703). 3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). -4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). +4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840) and [USENIX ATC 2021](https://www.usenix.org/conference/atc21/presentation/ren-jie). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888) and [ICML 2021](http://proceedings.mlr.press/v139/tang21a.html). -6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. 
[arXiv:2104.07857](https://arxiv.org/abs/2104.07857). -7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069). -8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) Curriculum Learning: A Regularization Method for Efficient and Stable Billion-Scale GPT Model Pre-Training. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084). +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205). +7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069) and [HiPC 2022](https://hipc.org/advance-program/). +8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084) and [NeurIPS 2022](https://openreview.net/forum?id=JpZ5du_Kdh). 9. Yucheng Lu, Conglong Li, Minjia Zhang, Christopher De Sa, Yuxiong He. (2022) Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam. [arXiv:2202.06009](https://arxiv.org/abs/2202.06009). -10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596). +10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. 
(2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html). +11. Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, Bryan Catanzaro. (2022) Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model [arXiv:2201.11990](https://arxiv.org/abs/2201.11990). +12. Xiaoxia Wu, Zhewei Yao, Minjia Zhang, Conglong Li, Yuxiong He. (2022) Extreme Compression for Pre-trained Transformers Made Simple and Efficient. [arXiv:2206.01859](https://arxiv.org/abs/2206.01859) and [NeurIPS 2022](https://openreview.net/forum?id=xNeAhc2CNAl). +13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1). +14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). +15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586). +16. 
Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597). + # Videos 1. DeepSpeed KDD 2020 Tutorial @@ -225,5 +225,6 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information * [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html). 3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8) 4. Community Tutorials - * [DeepSpeed: All the tricks to scale to gigantic models](https://www.youtube.com/watch?v=pDGI668pNg0) - * [Turing-NLG, DeepSpeed and the ZeRO optimizer](https://www.youtube.com/watch?v=tC01FRB0M7w) + * [DeepSpeed: All the tricks to scale to gigantic models (Mark Saroufim)](https://www.youtube.com/watch?v=pDGI668pNg0) + * [Turing-NLG, DeepSpeed and the ZeRO optimizer (Yannic Kilcher)](https://www.youtube.com/watch?v=tC01FRB0M7w) + * [Ultimate Guide To Scaling ML Models (The AI Epiphany)](https://www.youtube.com/watch?v=hc0u4avAkuM) diff --git a/accelerator/__init__.py b/accelerator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d29c3b15467b979c02068488a4ac849e082286f1 --- /dev/null +++ b/accelerator/__init__.py @@ -0,0 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .abstract_accelerator import DeepSpeedAccelerator +from .real_accelerator import get_accelerator, set_accelerator diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py new file mode 100644 index 0000000000000000000000000000000000000000..03dfbe9df43dad69ce024c450f99ea2f6b794e6b --- /dev/null +++ b/accelerator/abstract_accelerator.py @@ -0,0 +1,236 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import abc +from abc import ABC + + +class DeepSpeedAccelerator(ABC): + def 
__init__(self): + self._name = None + self._communication_backend_name = None + + # Device APIs + @abc.abstractmethod + def device_name(self, device_index): + ... + + @abc.abstractmethod + def device(self, device_index): + ... + + @abc.abstractmethod + def set_device(self, device_index): + ... + + @abc.abstractmethod + def current_device(self): + ... + + @abc.abstractmethod + def current_device_name(self): + ... + + @abc.abstractmethod + def device_count(self): + ... + + @abc.abstractmethod + def synchronize(self, device_index=None): + ... + + # RNG APIs + @abc.abstractmethod + def random(self): + ... + + @abc.abstractmethod + def set_rng_state(self, new_state, device_index=None): + ... + + @abc.abstractmethod + def get_rng_state(self, device_index=None): + ... + + @abc.abstractmethod + def manual_seed(self, seed): + ... + + @abc.abstractmethod + def manual_seed_all(self, seed): + ... + + @abc.abstractmethod + def initial_seed(self, seed): + ... + + @abc.abstractmethod + def default_generator(self, device_index): + ... + + # Streams/Events + @property + @abc.abstractmethod + def Stream(self): + ... + + @abc.abstractmethod + def stream(self, stream): + ... + + @abc.abstractmethod + def current_stream(self, device_index=None): + ... + + @abc.abstractmethod + def default_stream(self, device_index=None): + ... + + @property + @abc.abstractmethod + def Event(self): + ... + + # Memory management + @abc.abstractmethod + def empty_cache(self): + ... + + @abc.abstractmethod + def memory_allocated(self, device_index=None): + ... + + @abc.abstractmethod + def max_memory_allocated(self, device_index=None): + ... + + @abc.abstractmethod + def reset_max_memory_allocated(self, device_index=None): + ... + + @abc.abstractmethod + def memory_cached(self, device_index=None): + ... + + @abc.abstractmethod + def max_memory_cached(self, device_index=None): + ... + + @abc.abstractmethod + def reset_max_memory_cached(self, device_index=None): + ... 
+ + @abc.abstractmethod + def memory_stats(self, device_index=None): + ... + + @abc.abstractmethod + def reset_peak_memory_stats(self, device_index=None): + ... + + @abc.abstractmethod + def memory_reserved(self, device_index=None): + ... + + @abc.abstractmethod + def max_memory_reserved(self, device_index=None): + ... + + @abc.abstractmethod + def total_memory(self, device_index=None): + ... + + # Data types + @abc.abstractmethod + def is_bf16_supported(self): + ... + + @abc.abstractmethod + def is_fp16_supported(self): + ... + + # Misc + @abc.abstractmethod + def amp(self): + ... + + @abc.abstractmethod + def is_available(self): + ... + + @abc.abstractmethod + def range_push(self, msg): + ... + + @abc.abstractmethod + def range_pop(self): + ... + + @abc.abstractmethod + def lazy_call(self, callback): + ... + + @abc.abstractmethod + def communication_backend_name(self): + ... + + # Tensor operations + @property + @abc.abstractmethod + def BFloat16Tensor(self): + ... + + @property + @abc.abstractmethod + def ByteTensor(self): + ... + + @property + @abc.abstractmethod + def DoubleTensor(self): + ... + + @property + @abc.abstractmethod + def FloatTensor(self): + ... + + @property + @abc.abstractmethod + def HalfTensor(self): + ... + + @property + @abc.abstractmethod + def IntTensor(self): + ... + + @property + @abc.abstractmethod + def LongTensor(self): + ... + + @abc.abstractmethod + def pin_memory(self, tensor): + ... + + @abc.abstractmethod + def on_accelerator(self, tensor): + ... + + @abc.abstractmethod + def op_builder_dir(self): + ... + + # create an instance of op builder, specified by class_name + @abc.abstractmethod + def create_op_builder(self, class_name): + ... + + # return an op builder class, specified by class_name + @abc.abstractmethod + def get_op_builder(self, class_name): + ... + + @abc.abstractmethod + def build_extension(self): + ... 
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py new file mode 100644 index 0000000000000000000000000000000000000000..cdf4cd447764fb4337643f181d9a1e9b12b7344f --- /dev/null +++ b/accelerator/cuda_accelerator.py @@ -0,0 +1,254 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import pkgutil +import importlib + +from .abstract_accelerator import DeepSpeedAccelerator +# During setup stage torch may not be installed, pass on no torch will +# allow op builder related API to be executed. +try: + import torch.cuda +except ImportError: + pass + + +class CUDA_Accelerator(DeepSpeedAccelerator): + def __init__(self): + self._name = 'cuda' + self._communication_backend_name = 'nccl' + + # begin initialize for create_op_builder() + # put all valid class name <--> class type mapping into class_dict + op_builder_dir = self.op_builder_dir() + op_builder_module = importlib.import_module(op_builder_dir) + + for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]): + # avoid self references + if module_name != 'all_ops' and module_name != 'builder': + module = importlib.import_module("{}.{}".format( + op_builder_dir, + module_name)) + for member_name in module.__dir__(): + if member_name.endswith( + 'Builder' + ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes + if not member_name in self.class_dict: + self.class_dict[member_name] = getattr(module, member_name) + # end initialize for create_op_builder() + + # Device APIs + def device_name(self, device_index=None): + if device_index == None: + return 'cuda' + return 'cuda:{}'.format(device_index) + + def device(self, device_index=None): + return torch.cuda.device(device_index) + + def set_device(self, device_index): + torch.cuda.set_device(device_index) + + def current_device(self): + return torch.cuda.current_device() + + def current_device_name(self): + return 
'cuda:{}'.format(torch.cuda.current_device()) + + def device_count(self): + return torch.cuda.device_count() + + def synchronize(self, device_index=None): + return torch.cuda.synchronize(device_index) + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + if device_index is None: + return torch.cuda.set_rng_state(new_state) + + return torch.cuda.set_rng_state(new_state, device_index) + + def get_rng_state(self, device_index=None): + if device_index is None: + return torch.cuda.get_rng_state() + + return torch.cuda.get_rng_state(device_index) + + def manual_seed(self, seed): + return torch.cuda.manual_seed(seed) + + def manual_seed_all(self, seed): + return torch.cuda.manual_seed_all(seed) + + def initial_seed(self, seed): + return torch.cuda.initial_seed(seed) + + def default_generator(self, device_index): + return torch.cuda.default_generators[device_index] + + # Streams/Events + @property + def Stream(self): + return torch.cuda.Stream + + def stream(self, stream): + return torch.cuda.stream(stream) + + def current_stream(self, device_index=None): + return torch.cuda.current_stream(device_index) + + def default_stream(self, device_index=None): + return torch.cuda.default_stream(device_index) + + @property + def Event(self): + return torch.cuda.Event + + # Memory management + def empty_cache(self): + return torch.cuda.empty_cache() + + def memory_allocated(self, device_index=None): + return torch.cuda.memory_allocated(device_index) + + def max_memory_allocated(self, device_index=None): + return torch.cuda.max_memory_allocated(device_index) + + def reset_max_memory_allocated(self, device_index=None): + return torch.cuda.reset_max_memory_allocated(device_index) + + def memory_cached(self, device_index=None): + return torch.cuda.memory_cached(device_index) + + def max_memory_cached(self, device_index=None): + return torch.cuda.max_memory_cached(device_index) + + def reset_max_memory_cached(self, 
device_index=None): + return torch.cuda.reset_max_memory_cached(device_index) + + def memory_stats(self, device_index=None): + if hasattr(torch.cuda, 'memory_stats'): + return torch.cuda.memory_stats(device_index) + + def reset_peak_memory_stats(self, device_index=None): + if hasattr(torch.cuda, 'reset_peak_memory_stats'): + return torch.cuda.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + if hasattr(torch.cuda, 'memory_reserved'): + return torch.cuda.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + if hasattr(torch.cuda, 'max_memory_reserved'): + return torch.cuda.max_memory_reserved(device_index) + + def total_memory(self, device_index=None): + return torch.cuda.get_device_properties(device_index).total_memory + + # Data types + def is_bf16_supported(self): + return torch.cuda.is_bf16_supported() + + def is_fp16_supported(self): + major, _ = torch.cuda.get_device_capability() + if major >= 7: + return True + else: + return False + + # Misc + def amp(self): + if hasattr(torch.cuda, 'amp'): + return torch.cuda.amp + return None + + def is_available(self): + return torch.cuda.is_available() + + def range_push(self, msg): + if hasattr(torch.cuda.nvtx, 'range_push'): + return torch.cuda.nvtx.range_push(msg) + + def range_pop(self): + if hasattr(torch.cuda.nvtx, 'range_pop'): + return torch.cuda.nvtx.range_pop() + + def lazy_call(self, callback): + return torch.cuda._lazy_call(callback) + + def communication_backend_name(self): + return self._communication_backend_name + + # Tensor operations + + @property + def BFloat16Tensor(self): + return torch.cuda.BFloat16Tensor + + @property + def ByteTensor(self): + return torch.cuda.ByteTensor + + @property + def DoubleTensor(self): + return torch.cuda.DoubleTensor + + @property + def FloatTensor(self): + return torch.cuda.FloatTensor + + @property + def HalfTensor(self): + return torch.cuda.HalfTensor + + @property + def IntTensor(self): + return 
torch.cuda.IntTensor + + @property + def LongTensor(self): + return torch.cuda.LongTensor + + def pin_memory(self, tensor): + return tensor.pin_memory() + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('cuda:'): + return True + else: + return False + + def op_builder_dir(self): + try: + # during installation time op_builder is visible, otherwise return deepspeed.ops.op_builder + import op_builder # noqa: F401 + return "op_builder" + except ImportError: + return "deepspeed.ops.op_builder" + + # dict that holds class name <--> class type mapping i.e. + # 'AsyncIOBuilder': + # this dict will be filled at init stage + class_dict = {} + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + if class_name in self.class_dict: + return self.class_dict[class_name]() + else: + return None + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return None + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py new file mode 100644 index 0000000000000000000000000000000000000000..06cbb0b08e7a60075fde9500bbca7495799b5589 --- /dev/null +++ b/accelerator/real_accelerator.py @@ -0,0 +1,105 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +try: + from accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa1 +except ImportError as e: + dsa1 = None +try: + from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa2 +except ImportError as e: + dsa2 = None + +ds_accelerator = None + + +def _validate_accelerator(accel_obj): + # because abstract_accelerator has different path during + # build time (accelerator.abstract_accelerator) + # and run time 
(deepspeed.accelerator.abstract_accelerator) + # and extension would import the + # run time abstract_accelerator/DeepSpeedAccelerator as its base + # class, so we need to compare accel_obj with both base class. + # if accel_obj is instance of DeepSpeedAccelerator in one of + # accelerator.abstractor_accelerator + # or deepspeed.accelerator.abstract_accelerator, consider accel_obj + # is a conforming object + if not ((dsa1 != None and isinstance(accel_obj, + dsa1)) or + (dsa2 != None and isinstance(accel_obj, + dsa2))): + raise AssertionError( + f'{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator' + ) + + # TODO: turn off is_available test since this breaks tests + #assert accel_obj.is_available(), \ + # f'{accel_obj.__class__.__name__} accelerator fails is_available() test' + + +def get_accelerator(): + global ds_accelerator + if ds_accelerator is None: + try: + from intel_extension_for_deepspeed import XPU_Accelerator + except ImportError as e: + pass + else: + ds_accelerator = XPU_Accelerator() + _validate_accelerator(ds_accelerator) + return ds_accelerator + + from .cuda_accelerator import CUDA_Accelerator + ds_accelerator = CUDA_Accelerator() + _validate_accelerator(ds_accelerator) + return ds_accelerator + + +def set_accelerator(accel_obj): + global ds_accelerator + _validate_accelerator(accel_obj) + ds_accelerator = accel_obj + + +''' +-----------[code] test_get.py ----------- +from deepspeed.accelerator import get_accelerator +my_accelerator = get_accelerator() +print(f'{my_accelerator._name=}') +print(f'{my_accelerator._communication_backend=}') +print(f'{my_accelerator.HalfTensor().device=}') +print(f'{my_accelerator.total_memory()=}') +-----------[code] test_get.py ----------- + +---[output] python test_get.py--------- +my_accelerator.name()='cuda' +my_accelerator.communication_backend='nccl' +my_accelerator.HalfTensor().device=device(type='cuda', index=0) +my_accelerator.total_memory()=34089730048 +---[output] python 
test_get.py--------- + +************************************************************************** +-----------[code] test_set.py ----------- +from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator +cu_accel = CUDA_Accelerator() +print(f'{id(cu_accel)=}') +from deepspeed.accelerator import set_accelerator, get_accelerator +set_accelerator(cu_accel) + +my_accelerator = get_accelerator() +print(f'{id(my_accelerator)=}') +print(f'{my_accelerator._name=}') +print(f'{my_accelerator._communication_backend=}') +print(f'{my_accelerator.HalfTensor().device=}') +print(f'{my_accelerator.total_memory()=}') +-----------[code] test_set.py ----------- + + +---[output] python test_set.py--------- +id(cu_accel)=139648165478304 +my_accelerator= +my_accelerator.name='cuda' +my_accelerator.communication_backend='nccl' +my_accelerator.HalfTensor().device=device(type='cuda', index=0) +my_accelerator.total_memory()=34089730048 +---[output] python test_set.py--------- +''' diff --git a/azure/README.md b/azure/README.md index 1cca695bfa7e1ef6a45a5f680134c97b86a46948..df222b9a2759f7e5e16516456c56689a15de1f6b 100644 --- a/azure/README.md +++ b/azure/README.md @@ -1,3 +1,3 @@ # Getting Started with DeepSpeed on Azure -Please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/) to get started with DeepSpeed on Azure! +The recommended and simplest method to try DeepSpeed on Azure is through [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/). For more details, please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). 
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 --- /dev/null +++ b/benchmarks/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/benchmarks/communication/README.md b/benchmarks/communication/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f760465b5c972f9b6364e7bbcfb11871ac079cf1 --- /dev/null +++ b/benchmarks/communication/README.md @@ -0,0 +1,75 @@ +# Running Communication Benchmarks + + +To run benchmarks, there are two options: + +1. Run a single communication operation: + +For example, run with a single large message size: +
+deepspeed all_reduce.py
+
+ +Scan across message sizes: +
+deepspeed all_reduce.py --scan
+
+ +2. Run all available communication benchmarks: + +
+deepspeed run_all.py
+
+ +Like the individual benchmarks, `run_all.py` supports scanning arguments for the max message size, bw-unit, etc. Simply pass the desired arguments to `run_all.py` and they'll be propagated to each comm op. + +
+usage: ds_bench [-h] [--local_rank LOCAL_RANK] [--trials TRIALS] [--warmups WARMUPS] [--maxsize MAXSIZE] [--async-op] [--bw-unit {Gbps,GBps}] [--backend {nccl}] [--dist {deepspeed,torch}] [--scan] [--raw] [--all-reduce] [--all-gather] [--all-to-all]
+                [--pt2pt] [--broadcast] [--dtype DTYPE] [--mem-factor MEM_FACTOR] [--debug]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --local_rank LOCAL_RANK
+  --trials TRIALS       Number of timed iterations
+  --warmups WARMUPS     Number of warmup (non-timed) iterations
+  --maxsize MAXSIZE     Max message size as a power of 2
+  --async-op            Enables non-blocking communication
+  --bw-unit {Gbps,GBps}
+  --backend {nccl}      Communication library to use
+  --dist {deepspeed,torch}
+                        Distributed DL framework to use
+  --scan                Enables scanning all message sizes
+  --raw                 Print the message size and latency without units
+  --all-reduce          Run all_reduce
+  --all-gather          Run all_gather
+  --all-to-all          Run all_to_all
+  --pt2pt               Run pt2pt
+  --broadcast           Run broadcast
+  --dtype DTYPE         PyTorch tensor dtype
+  --mem-factor MEM_FACTOR
+                        Proportion of max available GPU memory to use for single-size evals
+  --debug               Enables all_to_all debug prints
+
+ +Note that `ds_bench` is a pre-packaged wrapper around `run_all.py`. Users can pass the same arguments as well: + +
+/bin/ds_bench --scan --trials=10
+
+ +Finally, users can choose specific communication operations to run in `run_all.py` or `ds_bench` by passing them as arguments (all operations are run by default). For example: + +
+deepspeed run_all.py --scan --all-reduce --all-to-all --broadcast
+
+ + +# Adding Communication Benchmarks + +To add new communication benchmarks, follow this general procedure: + +1. Copy a similar benchmark file (e.g. to add `reduce_scatter`, copy `all_reduce.py` as a template) +2. Add a new bw formula in `utils.get_bw`, a new maximum tensor element formula in `utils.max_numel`, and a new arg in `utils.benchmark_parser` +3. Replace comm op calls in new file with find-replace +4. Find a good default `mem_factor` for use in `run__single()` function +5. Add new comm op to `run_all.py` diff --git a/benchmarks/communication/__init__.py b/benchmarks/communication/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 --- /dev/null +++ b/benchmarks/communication/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py new file mode 100644 index 0000000000000000000000000000000000000000..dc97267b384020e408739017c3b9051434211b34 --- /dev/null +++ b/benchmarks/communication/all_gather.py @@ -0,0 +1,159 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from benchmarks.communication.utils import * +from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator + +import time + + +# Run all_gather and print metrics +def timed_all_gather(input, output, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + sync_all() + # Warmups, establish connections, etc. 
+ for i in range(args.warmups): + # use all_gather_base if available + if args.dist == 'torch': + if hasattr(torch.distributed, "_all_gather_base"): + dist._all_gather_base(output, input, group=None, async_op=args.async_op) + else: + output_tensors = list( + torch.chunk(output_tensor, + cdb.get_world_size(group))) + dist.all_gather(output_tensors, input_tensor, group=group, async_op=True) + elif args.dist == 'deepspeed': + dist.allgather_fn(output, input, group=None, async_op=args.async_op) + sync_all() + + # time the actual comm op trials times and average it + pre = time.perf_counter() + for i in range(args.trials): + # use all_gather_base if available + if args.dist == 'torch': + if hasattr(torch.distributed, "_all_gather_base"): + dist._all_gather_base(output, input, group=None, async_op=args.async_op) + else: + output_tensors = list( + torch.chunk(output_tensor, + cdb.get_world_size(group))) + dist.all_gather(output_tensors, input_tensor, group=group, async_op=True) + elif args.dist == 'deepspeed': + dist.allgather_fn(output, input, group=None, async_op=args.async_op) + sync_all() + duration = time.perf_counter() - pre + + # maintain and clean performance data + avg_duration = duration / args.trials + size = input.element_size() * input.nelement() + n = dist.get_world_size() + tput, busbw = get_bw('all_gather', size, avg_duration, args) + tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) + desc = f'{input.nelement()}x{input.element_size()}' + + if not args.raw: + size = convert_size(size) + + print_rank_0( + f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + + +def run_all_gather(local_rank, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + # Prepare benchmark header + print_header(args, 'all_gather') + global_rank = dist.get_rank() + world_size = dist.get_world_size() + + if args.scan: + # Create list of 
message sizes + M_LIST = [] + for x in (2**p for p in range(1, args.maxsize)): + M_LIST.append(x) + + sync_all() + # loop over various tensor sizes + for M in M_LIST: + global_rank = dist.get_rank() + try: + mat = torch.ones(world_size, + M, + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + sync_all() + input = ((mat.mul_(float(global_rank))).view(-1)) + # Delete original mat to avoid OOM + del mat + get_accelerator().empty_cache() + output = torch.zeros(input.nelement() * world_size, + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print('WARNING: Ran out of GPU memory. Exiting comm op.') + sync_all() + break + sync_all() + timed_all_gather(input, output, args) + else: + # all_gather_base saves memory + if (args.dist == 'torch' + and hasattr(torch.distributed, + "_all_gather_base")) or (args.dist == 'deepspeed' + and dist.has_allgather_base): + mem_factor = args.mem_factor + 0.2 + else: + mem_factor = args.mem_factor + # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor + sync_all() + elements_per_gpu = max_numel(comm_op='all_gather', + dtype=getattr(torch, + args.dtype), + mem_factor=mem_factor, + local_rank=local_rank, + args=args) + try: + mat = torch.ones(elements_per_gpu, + dtype=getattr(torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + # multiply each GPU's tensor by the rank to ease debugging + input = ((mat.mul_(float(global_rank))).view(-1)) + # Delete original mat to avoid OOM + del mat + get_accelerator().empty_cache() + output = torch.zeros( + elements_per_gpu * world_size, + dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print( + 'WARNING: Ran out of GPU memory. 
Try to reduce the --mem-factor argument!' + ) + sync_all() + return + + sync_all() + timed_all_gather(input, output, args) + + +if __name__ == "__main__": + args = benchmark_parser().parse_args() + rank = args.local_rank + init_processes(local_rank=rank, args=args) + run_all_gather(local_rank=rank, args=args) diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py new file mode 100644 index 0000000000000000000000000000000000000000..edc1b99301c06e7c8c4b5807e35ad2afa39bf17b --- /dev/null +++ b/benchmarks/communication/all_reduce.py @@ -0,0 +1,113 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from benchmarks.communication.utils import * +from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator + +import time + + +def timed_all_reduce(input, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + sync_all() + # Warmups, establish connections, etc. 
+ for i in range(args.warmups): + dist.all_reduce(input, async_op=args.async_op) + sync_all() + + # time the actual comm op trials times and average it + pre = time.perf_counter() + for i in range(args.trials): + dist.all_reduce(input, async_op=args.async_op) + sync_all() + duration = time.perf_counter() - pre + + # maintain and clean performance data + avg_duration = duration / args.trials + size = input.element_size() * input.nelement() + n = dist.get_world_size() + tput, busbw = get_bw('all_reduce', size, avg_duration, args) + tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) + desc = f'{input.nelement()}x{input.element_size()}' + + if not args.raw: + size = convert_size(size) + + print_rank_0( + f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + + +def run_all_reduce(local_rank, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + # Prepare benchmark header + print_header(args, 'all_reduce') + + world_size = dist.get_world_size() + global_rank = dist.get_rank() + + if args.scan: + M_LIST = [] + for x in (2**p for p in range(1, args.maxsize)): + M_LIST.append(x) + + sync_all() + # loop over various tensor sizes + for M in M_LIST: + global_rank = dist.get_rank() + try: + mat = torch.ones(world_size, + M, + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + sync_all() + input = ((mat.mul_(float(global_rank))).view(-1)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print('WARNING: Ran out of GPU memory. Exiting comm op.') + sync_all() + break + sync_all() + timed_all_reduce(input, args) + else: + # Send the biggest message size our GPUs can fit. 
If you're facing OOM errors, reduce the mem_factor + # Don't need output tensor, so we double mem_factor + elements_per_gpu = max_numel(comm_op='all_reduce', + dtype=getattr(torch, + args.dtype), + mem_factor=args.mem_factor * 2, + local_rank=local_rank, + args=args) + try: + mat = torch.ones(elements_per_gpu, + dtype=getattr(torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + input = ((mat.mul_(float(global_rank))).view(-1)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print( + 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' + ) + sync_all() + return + sync_all() + timed_all_reduce(input, args) + + +if __name__ == "__main__": + args = benchmark_parser().parse_args() + rank = args.local_rank + init_processes(local_rank=rank, args=args) + run_all_reduce(local_rank=rank, args=args) diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py new file mode 100644 index 0000000000000000000000000000000000000000..bd35cf290e4c0f35b2b2ce4b2c4ea17f876e52ec --- /dev/null +++ b/benchmarks/communication/all_to_all.py @@ -0,0 +1,134 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from benchmarks.communication.utils import * +from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator + +import time + + +def timed_all_to_all(input, output, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + sync_all() + # Warmups, establish connections, etc. 
+ for i in range(args.warmups): + dist.all_to_all_single(output, input, async_op=args.async_op) + sync_all() + + # time the actual comm op trials times and average it + pre = time.perf_counter() + for i in range(args.trials): + dist.all_to_all_single(output, input, async_op=args.async_op) + sync_all() + duration = time.perf_counter() - pre + + # maintain and clean performance data + avg_duration = duration / args.trials + size = input.element_size() * input.nelement() + n = dist.get_world_size() + tput, busbw = get_bw('all_to_all', size, avg_duration, args) + tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) + desc = f'{input.nelement()}x{input.element_size()}' + + if not args.raw: + size = convert_size(size) + + print_rank_0( + f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + + +def run_all_to_all(local_rank, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + world_size = dist.get_world_size() + global_rank = dist.get_rank() + # Prepare benchmark header + print_header(args, 'all_to_all') + + if args.scan: + M_LIST = [] + for x in (2**p for p in range(1, args.maxsize)): + M_LIST.append(x) + + sync_all() + # loop over various tensor sizes + for M in M_LIST: + global_rank = dist.get_rank() + try: + mat = torch.ones(world_size, + M, + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + assert mat.numel() % world_size == 0, f"tensor cannot be divided in {world_size} chunks" + sync_all() + input = ((mat.mul_(float(global_rank))).view(-1)) + output = (mat.clone().view(-1)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print('WARNING: Ran out of GPU memory. Exiting comm op.') + sync_all() + break + sync_all() + timed_all_to_all(input, output, args) + else: + # Send the biggest message size our GPUs can fit. 
If you're facing OOM errors, reduce the mem_factor + elements_per_gpu = max_numel(comm_op='all_to_all', + dtype=getattr(torch, + args.dtype), + mem_factor=args.mem_factor, + local_rank=local_rank, + args=args) + try: + mat = torch.ones(elements_per_gpu, + dtype=getattr(torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks" + input = ((mat.mul_(float(global_rank))).view(-1)) + # Delete original mat to avoid OOM + del mat + get_accelerator().empty_cache() + output = torch.zeros( + elements_per_gpu, + dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print( + 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' + ) + sync_all() + return + sync_all() + + if args.debug: + for i in range(world_size): + if i == global_rank: + print(f"Before AllToAll Input List at rank {global_rank}: {input}") + dist.barrier() + + timed_all_to_all(input, output, args) + + if args.debug: + for i in range(world_size): + if i == global_rank: + print(f"AllToAll Results at rank {global_rank}: {output}") + dist.barrier() + + +if __name__ == "__main__": + args = benchmark_parser().parse_args() + rank = args.local_rank + init_processes(local_rank=rank, args=args) + run_all_to_all(local_rank=rank, args=args) diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py new file mode 100644 index 0000000000000000000000000000000000000000..633e46638fac775920b89b0fc2ab9b0f4401dc79 --- /dev/null +++ b/benchmarks/communication/broadcast.py @@ -0,0 +1,114 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from benchmarks.communication.utils import * +from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator + +import time + + +def 
timed_broadcast(input, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + sync_all() + # Warmups, establish connections, etc. + for i in range(args.warmups): + dist.broadcast(input, 0, async_op=args.async_op) + sync_all() + + # time the actual comm op trials times and average it + pre = time.perf_counter() + for i in range(args.trials): + dist.broadcast(input, 0, async_op=args.async_op) + sync_all() + duration = time.perf_counter() - pre + + # maintain and clean performance data + avg_duration = duration / args.trials + size = input.element_size() * input.nelement() + n = dist.get_world_size() + tput, busbw = get_bw('broadcast', size, avg_duration, args) + tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) + desc = f'{input.nelement()}x{input.element_size()}' + + if not args.raw: + size = convert_size(size) + + print_rank_0( + f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + + +def run_broadcast(local_rank, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + # Prepare benchmark header + print_header(args, 'broadcast') + + world_size = dist.get_world_size() + global_rank = dist.get_rank() + + if args.scan: + M_LIST = [] + for x in (2**p for p in range(1, args.maxsize)): + M_LIST.append(x) + + sync_all() + # loop over various tensor sizes + for M in M_LIST: + global_rank = dist.get_rank() + try: + mat = torch.ones(world_size, + M, + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + sync_all() + input = ((mat.mul_(float(global_rank))).view(-1)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print('WARNING: Ran out of GPU memory. 
Exiting comm op.') + sync_all() + break + sync_all() + timed_broadcast(input, args) + else: + # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor + # Don't need output tensor, so we double mem_factor + elements_per_gpu = max_numel(comm_op='broadcast', + dtype=getattr(torch, + args.dtype), + mem_factor=args.mem_factor * 2, + local_rank=local_rank, + args=args) + try: + mat = torch.ones(elements_per_gpu, + dtype=getattr(torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + input = ((mat.mul_(float(global_rank))).view(-1)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print( + 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' + ) + sync_all() + return + sync_all() + timed_broadcast(input, args) + + +if __name__ == "__main__": + args = benchmark_parser().parse_args() + rank = args.local_rank + init_processes(local_rank=rank, args=args) + run_broadcast(local_rank=rank, args=args) diff --git a/benchmarks/communication/constants.py b/benchmarks/communication/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..935927acd174256fe7cd552a3181977e2dfdb7d8 --- /dev/null +++ b/benchmarks/communication/constants.py @@ -0,0 +1,10 @@ +'''Copyright The Microsoft DeepSpeed Team''' +from deepspeed.accelerator import get_accelerator + +DEFAULT_WARMUPS = 5 +DEFAULT_TRIALS = 50 +DEFAULT_TYPE = 'float' +DEFAULT_BACKEND = get_accelerator().communication_backend_name() +DEFAULT_UNIT = 'Gbps' +DEFAULT_DIST = 'deepspeed' +DEFAULT_MAXSIZE = 24 diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py new file mode 100644 index 0000000000000000000000000000000000000000..1c890fc42e93585a067ef815f163a96d069096ef --- /dev/null +++ b/benchmarks/communication/pt2pt.py @@ -0,0 +1,132 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from benchmarks.communication.utils import * +from 
benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator + +import time + + +def timed_pt2pt(input, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + sync_all() + # Warmups, establish connections, etc. + for i in range(args.warmups): + if dist.get_rank() == 0: + if args.async_op: + dist.isend(input, 1) + else: + dist.send(input, 1) + if dist.get_rank() == 1: + if args.async_op: + dist.irecv(input, src=0) + else: + dist.recv(input, src=0) + sync_all() + + # time the actual comm op trials times and average it + pre = time.perf_counter() + for i in range(args.trials): + if dist.get_rank() == 0: + if args.async_op: + dist.isend(input, 1) + else: + dist.send(input, 1) + if dist.get_rank() == 1: + if args.async_op: + dist.irecv(input, src=0) + else: + dist.recv(input, src=0) + + sync_all() + duration = time.perf_counter() - pre + + # maintain and clean performance data + avg_duration = duration / args.trials + size = input.element_size() * input.nelement() + n = dist.get_world_size() + tput, busbw = get_bw('pt2pt', size, avg_duration, args) + tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) + desc = f'{input.nelement()}x{input.element_size()}' + + if not args.raw: + size = convert_size(size) + + print_rank_0( + f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + + +def run_pt2pt(local_rank, args): + if args.dist == 'torch': + import torch.distributed as dist + elif args.dist == 'deepspeed': + import deepspeed.comm as dist + + # Prepare benchmark header + print_header(args, 'pt2pt') + global_rank = dist.get_rank() + world_size = dist.get_world_size() + + if args.scan: + # Create list of message sizes + M_LIST = [] + for x in (2**p for p in range(1, args.maxsize)): + M_LIST.append(x) + + sync_all() + # loop over various tensor sizes + for M in M_LIST: + global_rank = dist.get_rank() + 
try: + mat = torch.ones(world_size, + M, + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + sync_all() + input = ((mat.mul_(float(global_rank))).view(-1)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print('WARNING: Ran out of GPU memory. Exiting comm op.') + sync_all() + break + sync_all() + timed_pt2pt(input, args) + else: + # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor + # Don't need output tensor, so double mem_factor + elements_per_gpu = max_numel(comm_op='pt2pt', + dtype=getattr(torch, + args.dtype), + mem_factor=args.mem_factor * 2, + local_rank=local_rank, + args=args) + try: + mat = torch.ones(elements_per_gpu, + dtype=getattr(torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) + input = ((mat.mul_(float(global_rank))).view(-1)) + except RuntimeError as e: + if 'out of memory' in str(e): + if dist.get_rank() == 0: + print( + 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' 
+ ) + sync_all() + return + sync_all() + timed_pt2pt(input, args) + + +if __name__ == "__main__": + args = benchmark_parser().parse_args() + rank = args.local_rank + init_processes(local_rank=rank, args=args) + run_pt2pt(local_rank=rank, args=args) diff --git a/benchmarks/communication/run_all.py b/benchmarks/communication/run_all.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec562cc9ae0dcc101477117a6158b54a1f4272a --- /dev/null +++ b/benchmarks/communication/run_all.py @@ -0,0 +1,49 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from benchmarks.communication.utils import * +from benchmarks.communication.all_reduce import run_all_reduce +from benchmarks.communication.all_gather import run_all_gather +from benchmarks.communication.all_to_all import run_all_to_all +from benchmarks.communication.pt2pt import run_pt2pt +from benchmarks.communication.broadcast import run_broadcast +from benchmarks.communication.constants import * + + +# For importing +def main(args, rank): + + init_processes(local_rank=rank, args=args) + + ops_to_run = [] + if args.all_reduce: + ops_to_run.append('all_reduce') + if args.all_gather: + ops_to_run.append('all_gather') + if args.broadcast: + ops_to_run.append('broadcast') + if args.pt2pt: + ops_to_run.append('pt2pt') + if args.all_to_all: + ops_to_run.append('all_to_all') + + if len(ops_to_run) == 0: + ops_to_run = ['all_reduce', 'all_gather', 'all_to_all', 'broadcast', 'pt2pt'] + + for comm_op in ops_to_run: + if comm_op == 'all_reduce': + run_all_reduce(local_rank=rank, args=args) + if comm_op == 'all_gather': + run_all_gather(local_rank=rank, args=args) + if comm_op == 'all_to_all': + run_all_to_all(local_rank=rank, args=args) + if comm_op == 'pt2pt': + run_pt2pt(local_rank=rank, args=args) + if comm_op == 'broadcast': + run_broadcast(local_rank=rank, args=args) + + +# For directly calling benchmark +if __name__ == "__main__": + args = benchmark_parser().parse_args() + rank = args.local_rank + main(args, 
rank) diff --git a/benchmarks/communication/utils.py b/benchmarks/communication/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b913dda14fe552cd7dc2f9fb46d878f5366d4c2a --- /dev/null +++ b/benchmarks/communication/utils.py @@ -0,0 +1,220 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import os +import math +import argparse +from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator + +global dist + + +def init_torch_distributed(backend): + global dist + import torch.distributed as dist + torch.distributed.init_process_group(backend) + local_rank = int(os.environ['LOCAL_RANK']) + get_accelerator().set_device(local_rank) + + +def init_deepspeed_comm(backend): + global dist + import deepspeed + import deepspeed.comm as dist + deepspeed.init_distributed(dist_backend=backend) + local_rank = int(os.environ['LOCAL_RANK']) + get_accelerator().set_device(local_rank) + + +def init_processes(local_rank, args): + if args.dist == 'deepspeed': + init_deepspeed_comm(args.backend) + elif args.dist == 'torch': + init_torch_distributed(args.backend) + else: + print_rank_0(f"distributed framework {args.dist} not supported") + exit(0) + + +def print_rank_0(message): + if dist.get_rank() == 0: + print(message) + + +def print_header(args, comm_op): + if comm_op == 'pt2pt': + world_size = 2 + else: + world_size = dist.get_world_size() + tput = f'Throughput ({args.bw_unit})' + busbw = f'BusBW ({args.bw_unit})' + header = f"\n---- Performance of {comm_op} on {world_size} devices ---------------------------------------------------------\n" + duration_str = 'Duration' + if args.raw: + duration_str += ' (us)' + header += f"{'Size (Bytes)':20s} {'Description':25s} {duration_str:20s} {tput:20s} {busbw:20s}\n" + header += "----------------------------------------------------------------------------------------------------" + print_rank_0(header) + + +def get_bw(comm_op, size, duration, args): + n = 
dist.get_world_size() + tput = 0 + busbw = 0 + if comm_op == "all_to_all": + tput = (size / duration) + busbw = (size / duration) * ((n - 1) / n) + elif comm_op == "all_gather": + size *= n + tput = (size / duration) + busbw = (size / duration) * ((n - 1) / n) + elif comm_op == "all_reduce": + tput = (size * 2 / duration) + busbw = (size / duration) * (2 * (n - 1) / n) + elif comm_op == "pt2pt" or comm_op == "broadcast": + tput = (size / duration) + busbw = tput + else: + print_rank_0("wrong comm_op specified") + exit(0) + + if args.bw_unit == 'Gbps': + tput *= 8 + busbw *= 8 + + return tput, busbw + + +def get_metric_strings(args, tput, busbw, duration): + duration_ms = duration * 1e3 + duration_us = duration * 1e6 + tput = f'{tput / 1e9:.3f}' + busbw = f'{busbw /1e9:.3f}' + + if duration_us < 1e3 or args.raw: + duration = f'{duration_us:.3f}' + if not args.raw: + duration += ' us' + else: + duration = f'{duration_ms:.3f} ms' + return tput, busbw, duration + + +def sync_all(): + get_accelerator().synchronize() + dist.barrier() + + +def max_numel(comm_op, dtype, mem_factor, local_rank, args): + dtype_size = _element_size(dtype) + max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor + if comm_op == 'all_reduce' or comm_op == 'pt2pt' or comm_op == 'broadcast': + elements_per_gpu = int(max_memory_per_gpu // dtype_size) + elif comm_op == 'all_gather': + # all_gather performance is lower for non-powers of two, and the output buffer size scales with world size + # Therefore, divide by world size and round down to nearest power of 2 + elements_per_gpu = int(max_memory_per_gpu // dtype_size // dist.get_world_size()) + elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2)))) + elif comm_op == 'all_to_all': + # Number of elements must be divisible by world_size + # all_to_all performance is lower for non-powers of two. Round down like all_gather. 
+ elements_per_gpu = int(max_memory_per_gpu // dtype_size) + elements_per_gpu = int(dist.get_world_size() * + round(elements_per_gpu / dist.get_world_size())) + elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2)))) + else: + print(f"This communication operation: {comm_op} is not supported yet") + exit(0) + return elements_per_gpu + + +# Helper function to pretty-print message sizes +def convert_size(size_bytes): + if size_bytes == 0: + return "0B" + size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return "%s %s" % (s, size_name[i]) + + +# Copied from torch. Need to add the func here for old torch compatibility. +def _element_size(dtype): + """ + Returns the element size for a dtype, in bytes + """ + if not isinstance(dtype, torch.dtype): + raise RuntimeError(f'expected torch.dtype, but got {type(dtype)}') + + if dtype.is_complex: + return torch.finfo(dtype).bits >> 2 + elif dtype.is_floating_point: + return torch.finfo(dtype).bits >> 3 + elif dtype == torch.bool: + # NOTE: torch.bool is not supported in torch.iinfo() + return 1 + else: + return torch.iinfo(dtype).bits >> 3 + + +def benchmark_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int) + parser.add_argument("--trials", + type=int, + default=DEFAULT_TRIALS, + help='Number of timed iterations') + parser.add_argument("--warmups", + type=int, + default=DEFAULT_WARMUPS, + help='Number of warmup (non-timed) iterations') + parser.add_argument("--maxsize", + type=int, + default=24, + help='Max message size as a power of 2') + parser.add_argument("--async-op", + action="store_true", + help='Enables non-blocking communication') + parser.add_argument("--bw-unit", + type=str, + default=DEFAULT_UNIT, + choices=['Gbps', + 'GBps']) + parser.add_argument("--backend", + type=str, + default=DEFAULT_BACKEND, + choices=['nccl', + 'ccl'], + 
help='Communication library to use') + parser.add_argument("--dist", + type=str, + default=DEFAULT_DIST, + choices=['deepspeed', + 'torch'], + help='Distributed DL framework to use') + parser.add_argument("--scan", + action="store_true", + help='Enables scanning all message sizes') + parser.add_argument("--raw", + action="store_true", + help='Print the message size and latency without units') + parser.add_argument("--all-reduce", action="store_true", help='Run all_reduce') + parser.add_argument("--all-gather", action="store_true", help='Run all_gather') + parser.add_argument("--all-to-all", action="store_true", help='Run all_to_all') + parser.add_argument("--pt2pt", action="store_true", help='Run pt2pt') + parser.add_argument("--broadcast", action="store_true", help='Run broadcast') + parser.add_argument("--dtype", + type=str, + default=DEFAULT_TYPE, + help='PyTorch tensor dtype') + parser.add_argument( + "--mem-factor", + type=float, + default=.4, + help='Proportion of max available GPU memory to use for single-size evals') + parser.add_argument("--debug", + action="store_true", + help='Enables all_to_all debug prints') + return parser diff --git a/benchmarks/inference/bert-bench.py b/benchmarks/inference/bert-bench.py new file mode 100644 index 0000000000000000000000000000000000000000..9d586d033cd7b375f5e82dfef53445199846630f --- /dev/null +++ b/benchmarks/inference/bert-bench.py @@ -0,0 +1,92 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import time +import deepspeed +import argparse +from transformers import pipeline +from deepspeed.accelerator import get_accelerator + +parser = argparse.ArgumentParser() +parser.add_argument("--model", "-m", type=str, help="hf model name") +parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference") +parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32") +parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens") 
+parser.add_argument("--local_rank", type=int, default=0, help="local rank") +parser.add_argument("--trials", type=int, default=30, help="number of trials") +parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on") +parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on") +args = parser.parse_args() + + +def print_latency(latency_set, title, warmup=3): + # trim warmup queries + latency_set = latency_set[warmup:] + count = len(latency_set) + if count > 0: + latency_set.sort() + n50 = (count - 1) * 0.5 + 1 + n90 = (count - 1) * 0.9 + 1 + n95 = (count - 1) * 0.95 + 1 + n99 = (count - 1) * 0.99 + 1 + n999 = (count - 1) * 0.999 + 1 + + avg = sum(latency_set) / count + p50 = latency_set[int(n50) - 1] + p90 = latency_set[int(n90) - 1] + p95 = latency_set[int(n95) - 1] + p99 = latency_set[int(n99) - 1] + p999 = latency_set[int(n999) - 1] + + print(f"====== latency stats {title} ======") + print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000)) + print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000)) + print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000)) + print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000)) + print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000)) + print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000)) + + +deepspeed.init_distributed() + +print(args.model, args.max_tokens, args.dtype) + +if args.dtype.lower() == "fp16": + dtype = torch.float16 +else: + dtype = torch.float32 + +pipe = pipeline("fill-mask", model=args.model, framework="pt", device=args.local_rank) + +if dtype == torch.half: + pipe.model.half() + +mask = pipe.tokenizer.mask_token + +br = pipe(f"Hello I'm a {mask} model") +if args.deepspeed: + pipe.model = deepspeed.init_inference(pipe.model, + dtype=dtype, + mp_size=1, + replace_with_kernel_inject=args.kernel_inject, + enable_cuda_graph=args.graphs) + pipe.model.profile_model_time() + +responses = [] +times = [] +mtimes = [] +for i in range(args.trials): + get_accelerator().synchronize() + 
start = time.time() + r = pipe(f"Hello I'm a {mask} model") + get_accelerator().synchronize() + end = time.time() + responses.append(r) + times.append((end - start)) + mtimes += pipe.model.model_times() + #print(f"{pipe.model.model_times()=}") + +print_latency(times, "e2e latency") +print_latency(mtimes, "model latency") + +print(responses[0:3]) diff --git a/benchmarks/inference/collect_results.py b/benchmarks/inference/collect_results.py new file mode 100644 index 0000000000000000000000000000000000000000..0e51033114db848d2d2ff14b2f33b009a2090672 --- /dev/null +++ b/benchmarks/inference/collect_results.py @@ -0,0 +1,147 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import re +import argparse +import pandas as pd + +parser = argparse.ArgumentParser() +parser.add_argument( + "--results-dir", + "-r", + type=str, + default="./results", + help="directory containing sweep results", +) +parser.add_argument("--version", + "-v", + type=int, + default=0, + help="version to be collected") +parser.add_argument("--gen-text-n", + "-n", + type=int, + default=1, + help="expected number of generated text") +parser.add_argument("--output", + "-o", + type=str, + default="./results.csv", + help="output file") +args = parser.parse_args() + + +def get_branch(file_path): + match = re.match(r".*\/(.*)\.log", file_path) + if match is None: + return False + else: + return match.groups()[0] + + +def get_benchmark_params(root_dir, file_path): + match = re.match( + rf"{root_dir}\/(.+?)_(fp\d+)_(true|false)_(true|false)_(\d+)gpus_v(\d+)\/", + file_path, + ) + if match is None: + return False + else: + model, dtype, graphs, kernel, gpus, version = match.groups() + bool_dict = {"true": True, "false": False} + return { + "model": model, + "dtype": dtype, + "graphs": bool_dict[graphs.lower()], + "kernel": bool_dict[kernel.lower()], + "gpus": int(gpus), + "version": int(version), + } + + +def get_perf_data(file_content): + matches = re.findall(r"\s+(.+?)\sLatency:\s+(\d+\.\d+)\sms", 
file_content) + if matches is []: + return False + else: + return {f"latency-{key}": float(val) for key, val in matches} + + +def get_generated_text(file_content, gen_text_n): + file_content = file_content.replace("\n", " ") + file_content = file_content.replace("\t", " ") + matches = re.findall(r"RESPONSE\s(\d+):\s+[-]{30}\s+(.+?)\s+[-]{30}", file_content) + if len(matches) != gen_text_n: + return False + else: + return {f"generated-text-{key}": val for key, val in matches} + + +def get_error(file_content): + matches = re.findall(r"Error:\s+(.+?)\n", file_content) + if matches is []: + return False + else: + return {f"error": val for val in matches} + + +if __name__ == "__main__": + # List to collect data from all benchmarks + benchmarks_data = [] + + # Walk through directory of results from sweep.sh + for root, dirs, files in os.walk(args.results_dir): + # Because of how some models are named, the dir structure for results can vary, e.g.: + # "EleutherAI/gpt-neo_*/baseline.log" versus "gpt2_*/baseline.log" + if dirs: + continue + + # Get data from baseline and each tested branch + for name in files: + file_path = os.path.join(root, name) + + branch = get_branch(file_path) + if not branch: + print(f"WARNING: Could not detect branch for file {file_path}, skipping") + continue + + params = get_benchmark_params(args.results_dir, file_path) + if not params: + print( + f"WARNING: Could not detect benchmark settings for file {file_path}, skipping" + ) + continue + + # Verify that the version matches that which we want to collect + if params["version"] != args.version: + continue + + with open(file_path, "r") as f: + file_content = f.read() + + perf_data = get_perf_data(file_content) + if not perf_data: + print( + f"WARNING: Could not detect benchmark performance data for file {file_path}" + ) + + generated_text = get_generated_text(file_content, args.gen_text_n) + if not generated_text: + print(f"WARNING: Could not detect generated text for file {file_path}") + + error 
= get_error(file_content) + if error: + print(f"Error found in {file_path}, collecting error info...") + benchmarks_data.append({"branch": branch, **params, **error}) + continue + + benchmarks_data.append({ + "branch": branch, + **params, + **perf_data, + **generated_text + }) + + # Convert to a DataFrame and save + benchmarks_df = pd.DataFrame(benchmarks_data) + benchmarks_df.to_csv(args.output) diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py new file mode 100644 index 0000000000000000000000000000000000000000..29578b30cf1faf1638bbd0a84865ee9c283e9443 --- /dev/null +++ b/benchmarks/inference/gpt-bench.py @@ -0,0 +1,124 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import torch +import time +import deepspeed +import argparse +from transformers import pipeline +from deepspeed.accelerator import get_accelerator + +parser = argparse.ArgumentParser() +parser.add_argument("--model", "-m", type=str, help="hf model name") +parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference") +parser.add_argument("--dtype", + type=str, + default="fp16", + choices=["fp16", + "fp32", + "int8"], + help="int8, fp16, or fp32") +parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on") +parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on") +parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens") +parser.add_argument("--local_rank", + type=int, + default=int(os.getenv("LOCAL_RANK", + "0")), + help="local rank") +parser.add_argument("--world_size", + type=int, + default=int(os.getenv("WORLD_SIZE", + "1")), + help="world size") +parser.add_argument("--trials", type=int, default=30, help="number of trials") +args = parser.parse_args() + + +def print_latency(latency_set, title, warmup=3): + # trim warmup queries + latency_set = list(latency_set) + latency_set = latency_set[warmup:] + count = len(latency_set) + if count > 0: + 
latency_set.sort() + n50 = (count - 1) * 0.5 + 1 + n90 = (count - 1) * 0.9 + 1 + n95 = (count - 1) * 0.95 + 1 + n99 = (count - 1) * 0.99 + 1 + n999 = (count - 1) * 0.999 + 1 + + avg = sum(latency_set) / count + p50 = latency_set[int(n50) - 1] + p90 = latency_set[int(n90) - 1] + p95 = latency_set[int(n95) - 1] + p99 = latency_set[int(n99) - 1] + p999 = latency_set[int(n999) - 1] + + print(f"====== latency stats {title} ======") + print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000)) + print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000)) + print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000)) + print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000)) + print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000)) + print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000)) + + +deepspeed.init_distributed() + +if args.local_rank == 0: + print("BENCHMARK SETTINGS:") + print(f"\tMODEL: {args.model}") + print(f"\tMAX_TOKENS: {args.max_tokens}") + print(f"\tDTYPE: {args.dtype}") + print(f"\tCUDA_GRAPHS: {args.graphs}") + print(f"\tKERNEL_INJECT: {args.kernel_inject}") + +if args.dtype == "int8": + dtype = torch.int8 +elif args.dtype == "fp16": + dtype = torch.float16 +else: + dtype = torch.float32 + +pipe = pipeline("text-generation", + model=args.model, + framework="pt", + device=args.local_rank) + +if dtype == torch.float16: + pipe.model.half() + +if args.deepspeed: + pipe.model = deepspeed.init_inference( + pipe.model, + dtype=dtype, + mp_size=args.world_size, + replace_with_kernel_inject=args.kernel_inject, + enable_cuda_graph=args.graphs, + ) + pipe.model.profile_model_time() + +responses = [] +times = [] +mtimes = [] +for i in range(args.trials): + get_accelerator().synchronize() + start = time.time() + r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens) + get_accelerator().synchronize() + end = time.time() + responses.append(r) + times.append(end - start) # / (args.max_tokens - 3)) + mtimes.append(sum(pipe.model.model_times())) + +if args.local_rank 
== 0: + print_latency(times, "(e2e) latency") + print_latency(mtimes, "(model-only) latency") + print_latency(map(lambda t: t / (args.max_tokens - 3), + times), + "(e2e) per token latency") + print(f"RESPONSE 0:") + print("-" * 30) + print(responses[0][0]["generated_text"]) + print("-" * 30) diff --git a/benchmarks/inference/requirements.txt b/benchmarks/inference/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..00899dd5f4858229e4115fd2b80b7807636892bd --- /dev/null +++ b/benchmarks/inference/requirements.txt @@ -0,0 +1 @@ +transformers>=4.21.3 diff --git a/benchmarks/inference/run_model.sh b/benchmarks/inference/run_model.sh new file mode 100644 index 0000000000000000000000000000000000000000..8e5fe3ac0133150a5de05f76da4951a2ead6be58 --- /dev/null +++ b/benchmarks/inference/run_model.sh @@ -0,0 +1,36 @@ +set -x + +model=$1 +branch1=$2 +branch2=$3 +dtype=$4 +graphs=$5 +kernel=$6 +gpus=$7 + +version=0 +log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version} +mkdir -p ${log_path} + +params="--dtype $dtype " +if [[ "$graphs" == "true" ]]; then + params+="--graphs " +fi +if [[ "$kernel" == "true" ]]; then + params+="--kernel " +fi + +echo "baseline $log_path" +deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log + +cd ../../ +git checkout ${branch1} +cd - +echo "ds ${branch1} $log_path" +deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/ds-${branch1}.log + +cd ../../ +git checkout ${branch2} +cd - +echo "ds ${branch2} $log_path" +deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params&> ${log_path}/ds-${branch2}.log diff --git a/benchmarks/inference/sweep.sh b/benchmarks/inference/sweep.sh new file mode 100644 index 0000000000000000000000000000000000000000..aabcb0bfdbd89e2eedd97d3e6afa74c0e50e7803 --- /dev/null +++ b/benchmarks/inference/sweep.sh @@ -0,0 +1,41 @@ +set -x + +export TRANSFORMERS_CACHE=/tmp/hf-cache + 
+branch1=$1 +branch2=$2 + +gptneo_models="EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M" +gpt2_models="gpt2 gpt2-large gpt2-xl" +gptj_models="EleutherAI/gpt-j-6B" +opt_models="facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b" +bloom_models="bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1" + +for gpus in `echo "1 2 4 8"`; do + for dtype in `echo "fp16 fp32"`; do + for graphs in `echo "true false"`; do + for kernel in `echo "true false"`; do + params="$dtype $graphs $kernel $gpus" + for m in `echo "$gptneo_models"`; do + bash run_model.sh $m $branch1 $branch2 $params + done + + for m in `echo "$gpt2_models"`; do + bash run_model.sh $m $branch1 $branch2 $params + done + + for m in `echo "$gptj_models"`; do + bash run_model.sh $m $branch1 $branch2 $params + done + + for m in `echo "$opt_models"`; do + bash run_model.sh $m $branch1 $branch2 $params + done + + for m in `echo "$bloom_models"`; do + bash run_model.sh $m $branch1 $branch2 $params + done + done + done + done +done diff --git a/bin/deepspeed b/bin/deepspeed deleted file mode 100644 index 5ec8820db922fcdb284ff18cbe7f21c3b2e4d38b..0000000000000000000000000000000000000000 --- a/bin/deepspeed +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -from deepspeed.launcher.runner import main - -if __name__ == '__main__': - main() diff --git a/bin/deepspeed b/bin/deepspeed new file mode 120000 index 0000000000000000000000000000000000000000..6b768564101983015fd56c8d604e439c2374ad06 --- /dev/null +++ b/bin/deepspeed @@ -0,0 +1 @@ +ds \ No newline at end of file diff --git a/bin/deepspeed.pt b/bin/deepspeed.pt deleted file mode 100644 index 5ec8820db922fcdb284ff18cbe7f21c3b2e4d38b..0000000000000000000000000000000000000000 --- a/bin/deepspeed.pt +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -from deepspeed.launcher.runner import main - -if __name__ == '__main__': - main() diff --git a/bin/deepspeed.pt 
b/bin/deepspeed.pt new file mode 120000 index 0000000000000000000000000000000000000000..6b768564101983015fd56c8d604e439c2374ad06 --- /dev/null +++ b/bin/deepspeed.pt @@ -0,0 +1 @@ +ds \ No newline at end of file diff --git a/bin/ds b/bin/ds old mode 100644 new mode 100755 diff --git a/bin/ds_bench b/bin/ds_bench new file mode 100755 index 0000000000000000000000000000000000000000..bfacbc8e25c8d21958a7cd44572f7177d5ff3cb5 --- /dev/null +++ b/bin/ds_bench @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +from benchmarks.communication.run_all import main +from benchmarks.communication.constants import * +from benchmarks.communication.utils import * +import os +import sys + +# Run the same file with deepspeed launcher. This is required since setuptools will auto-detect python files and insert a python shebang for both 'scripts' and 'entry_points', and this benchmarks require the DS launcher +required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] +if not all(map(lambda v: v in os.environ, required_env)): + import subprocess + subprocess.run("deepspeed $(which ds_bench) " + " ".join(sys.argv[1:]), shell=True) +else: + args = benchmark_parser().parse_args() + rank = args.local_rank + main(args, rank) diff --git a/bin/ds_elastic b/bin/ds_elastic old mode 100644 new mode 100755 diff --git a/bin/ds_report b/bin/ds_report old mode 100644 new mode 100755 diff --git a/bin/ds_ssh b/bin/ds_ssh old mode 100644 new mode 100755 diff --git a/bin/dsr b/bin/dsr new file mode 120000 index 0000000000000000000000000000000000000000..747bf4722c429a1e845cc1fc7527e249bec6af2c --- /dev/null +++ b/bin/dsr @@ -0,0 +1 @@ +ds_report \ No newline at end of file diff --git a/build_win.bat b/build_win.bat new file mode 100644 index 0000000000000000000000000000000000000000..ec8c8a362a783df677862d8b500ade56896422de --- /dev/null +++ b/build_win.bat @@ -0,0 +1,19 @@ +@echo off + +set DS_BUILD_AIO=0 +set DS_BUILD_SPARSE_ATTN=0 + +echo Administrative permissions required. 
Detecting permissions... + +net session >nul 2>&1 +if %errorLevel% == 0 ( + echo Success: Administrative permissions confirmed. +) else ( + echo Failure: Current permissions inadequate. + goto end +) + + +python setup.py bdist_wheel + +:end diff --git a/csrc/adagrad/cpu_adagrad.cpp b/csrc/adagrad/cpu_adagrad.cpp index 4f2a9b69ef966599d1bd6664f79e312c9240671b..9f8f95c4a876fdd5883cceea70f809d46544e994 100644 --- a/csrc/adagrad/cpu_adagrad.cpp +++ b/csrc/adagrad/cpu_adagrad.cpp @@ -1,16 +1,21 @@ +#ifdef __HIPCC__ +#include "cpu_adagrad_hip.h" +#else #include "cpu_adagrad.h" -#include -#include -#include +#endif + #include #include #include #include #include +#if defined(__ENABLE_CUDA__) +#include #include "cublas_v2.h" #include "cuda.h" #include "curand.h" #include "custom_cuda_layers.h" +#endif static std::unordered_map> s_optimizers; @@ -20,7 +25,7 @@ void Adagrad_Optimizer::Step_1(float* _params, float* grads, float* _exp_avg_sq, size_t _param_size, - __half* dev_params, + ds_half_precision_t* dev_params, bool half_precision) { size_t rounded_size = 0; @@ -30,17 +35,19 @@ void Adagrad_Optimizer::Step_1(float* _params, #endif if (_param_size > rounded_size) { float step_size = -1 * _alpha; - __half* grads_cast_h; - __half* params_cast_h; + ds_half_precision_t* grads_cast_h; + ds_half_precision_t* params_cast_h; if (half_precision) { - grads_cast_h = reinterpret_cast<__half*>(grads); - params_cast_h = reinterpret_cast<__half*>(_params); + grads_cast_h = reinterpret_cast(grads); + params_cast_h = reinterpret_cast(_params); } for (size_t t = rounded_size; t < _param_size; t += TILE) { size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; +#if defined(__ENABLE_CUDA__) if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } +#endif #pragma omp parallel for for (size_t k = t; k < offset; k++) { float grad = half_precision ? 
(float)grads_cast_h[k] : grads[k]; @@ -55,21 +62,24 @@ void Adagrad_Optimizer::Step_1(float* _params, grad += _eps; grad = momentum / grad; param = grad * step_size + param; +#if defined(__ENABLE_CUDA__) if (dev_params) _doubled_buffer[_buf_index][k - t] = param; - +#endif if (half_precision) - params_cast_h[k] = (__half)param; + params_cast_h[k] = (ds_half_precision_t)param; else _params[k] = param; // STORE UPDATE TERM TO GRAD'S MEMORY grads[k] = grad * step_size; _exp_avg_sq[k] = variance; } +#if defined(__ENABLE_CUDA__) if (dev_params) { launch_param_update( _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); _buf_index = !_buf_index; } +#endif } } } @@ -78,7 +88,7 @@ void Adagrad_Optimizer::Step_4(float* _params, float* grads, float* _exp_avg_sq, size_t _param_size, - __half* dev_params, + ds_half_precision_t* dev_params, bool half_precision) { size_t rounded_size = 0; @@ -130,7 +140,7 @@ void Adagrad_Optimizer::Step_8(float* _params, float* grads, float* _exp_avg_sq, size_t _param_size, - __half* dev_params, + ds_half_precision_t* dev_params, bool half_precision) { size_t rounded_size = 0; @@ -170,7 +180,9 @@ int ds_adagrad_step(int optimizer_id, opt->update_state(lr, epsilon, weight_decay); opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0)); +#if defined(__ENABLE_CUDA__) opt->SynchronizeStreams(); +#endif return 0; } @@ -184,6 +196,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id, torch::Tensor& exp_avg_sq, torch::Tensor& gpu_params) { +#if defined(__ENABLE_CUDA__) auto params_c = params.contiguous(); auto gpu_params_c = gpu_params.contiguous(); auto exp_avg_sq_c = exp_avg_sq.contiguous(); @@ -191,7 +204,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id, float* params_ptr = (float*)params_c.data_ptr(); float* grads_ptr = (float*)grads_c.data_ptr(); - __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); + ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr(); float* 
exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); std::shared_ptr opt = @@ -206,6 +219,9 @@ int ds_adagrad_step_plus_copy(int optimizer_id, (params.options().dtype() == at::kHalf)); opt->SynchronizeStreams(); +#else + assert(false); +#endif return 0; } diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index 727eec8182c12cdcc0dcb0df53a38918bdeae4b8..f17f22535ab8dfd56260daff7a2479e771f376a4 100644 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -1,16 +1,18 @@ #include "cpu_adam.h" -#include -#include -#include #include +#include #include #include #include #include + +#if defined(__ENABLE_CUDA__) +#include #include "cublas_v2.h" #include "cuda.h" #include "curand.h" #include "custom_cuda_layers.h" +#endif static std::unordered_map> s_optimizers; @@ -21,7 +23,7 @@ void Adam_Optimizer::Step_1(float* _params, float* _exp_avg, float* _exp_avg_sq, size_t _param_size, - __half* dev_params, + ds_half_precision_t* dev_params, bool half_precision) { size_t rounded_size = 0; @@ -41,19 +43,20 @@ void Adam_Optimizer::Step_1(float* _params, float step_size = -1 * _alpha / _bias_correction1; float w_decay = -1 * _alpha * _weight_decay; - __half* grads_cast_h; - __half* params_cast_h; + ds_half_precision_t* grads_cast_h; + ds_half_precision_t* params_cast_h; if (half_precision) { - grads_cast_h = reinterpret_cast<__half*>(grads); - params_cast_h = reinterpret_cast<__half*>(_params); + grads_cast_h = reinterpret_cast(grads); + params_cast_h = reinterpret_cast(_params); } for (size_t t = rounded_size; t < _param_size; t += TILE) { size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; +#if defined(__ENABLE_CUDA__) if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } - +#endif #pragma omp parallel for for (size_t k = t; k < offset; k++) { float grad = half_precision ? 
(float)grads_cast_h[k] : grads[k]; @@ -73,21 +76,24 @@ void Adam_Optimizer::Step_1(float* _params, grad = momentum / grad; if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; } param = grad * step_size + param; +#if defined(__ENABLE_CUDA__) if (dev_params) _doubled_buffer[_buf_index][k - t] = param; - +#endif if (half_precision) - params_cast_h[k] = (__half)param; + params_cast_h[k] = (ds_half_precision_t)param; else _params[k] = param; _exp_avg[k] = momentum; _exp_avg_sq[k] = variance; } +#if defined(__ENABLE_CUDA__) if (dev_params) { launch_param_update( _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); _buf_index = !_buf_index; } +#endif } } } @@ -97,7 +103,7 @@ void Adam_Optimizer::Step_4(float* _params, float* _exp_avg, float* _exp_avg_sq, size_t _param_size, - __half* dev_params, + ds_half_precision_t* dev_params, bool half_precision) { size_t rounded_size = 0; @@ -166,7 +172,7 @@ void Adam_Optimizer::Step_8(float* _params, float* _exp_avg, float* _exp_avg_sq, size_t _param_size, - __half* dev_params, + ds_half_precision_t* dev_params, bool half_precision) { size_t rounded_size = 0; @@ -228,7 +234,9 @@ int ds_adam_step(int optimizer_id, nullptr, (params.options().dtype() == at::kHalf)); +#if defined(__ENABLE_CUDA__) opt->SynchronizeStreams(); +#endif return 0; } @@ -246,6 +254,7 @@ int ds_adam_step_plus_copy(int optimizer_id, torch::Tensor& exp_avg_sq, torch::Tensor& gpu_params) { +#if defined(__ENABLE_CUDA__) auto params_c = params.contiguous(); auto gpu_params_c = gpu_params.contiguous(); auto exp_avg_c = exp_avg.contiguous(); @@ -254,7 +263,7 @@ int ds_adam_step_plus_copy(int optimizer_id, float* params_ptr = (float*)params_c.data_ptr(); float* grads_ptr = (float*)grads_c.data_ptr(); - __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); + ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr(); float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); float* exp_avg_sq_ptr = 
(float*)exp_avg_sq_c.data_ptr(); @@ -271,6 +280,9 @@ int ds_adam_step_plus_copy(int optimizer_id, (params.options().dtype() == at::kHalf)); opt->SynchronizeStreams(); +#else + assert(false); +#endif return 0; } diff --git a/csrc/adam/multi_tensor_adam.cu b/csrc/adam/multi_tensor_adam.cu index 3cb9763befcea663966347f23a2d9b925a7db8bd..611d9ffbe375306d94356abfefe81b29442454d4 100644 --- a/csrc/adam/multi_tensor_adam.cu +++ b/csrc/adam/multi_tensor_adam.cu @@ -12,7 +12,11 @@ #include +#ifdef __HIPCC__ +#include "multi_tensor_apply_hip.cuh" +#else #include "multi_tensor_apply.cuh" +#endif #include "type_shim.h" #define BLOCK_SIZE 512 diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp index 200c7030f120366c2e2a45cb6cc20785ec4518fd..e8bf9de1125907c898e21f52e81cef2cb08b131a 100644 --- a/csrc/aio/common/deepspeed_aio_utils.cpp +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -6,6 +6,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ #include +#include #include "deepspeed_aio_utils.h" @@ -113,8 +114,8 @@ void* ds_page_aligned_alloc(const size_t size, const bool lock) auto mlock_ret = mlock(ptr, size); if (mlock_ret != 0) { auto mlock_error = errno; - printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error)); - + std::cerr << "mlock failed to allocate " << size << " bytes with error no " << mlock_error + << " msg " << strerror(mlock_error) << std::endl; free(ptr); return nullptr; } diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..20bdf5b142a63a88e3636d6375d62b30bf4fd841 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp @@ -0,0 +1,43 @@ +/* +Copyright 2023 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for managing CPU tensors occupying page-locked memory. 
+*/ + +#include "deepspeed_pin_tensor.h" + +using namespace std; + +deepspeed_pin_tensor_t::~deepspeed_pin_tensor_t() +{ + for (auto iter = _locked_tensors.begin(); iter != _locked_tensors.end(); ++iter) { + munlock(iter->first, iter->second); + } + _locked_tensors.clear(); +} + +torch::Tensor deepspeed_pin_tensor_t::alloc(const size_t num_elem, const at::ScalarType& elem_type) +{ + const auto num_bytes = num_elem * elementSize(elem_type); + auto pinned_buffer = ds_page_aligned_alloc(num_bytes, true); + assert(nullptr != pinned_buffer); + + _locked_tensors[pinned_buffer] = num_bytes; + + auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU); + + return at::from_blob(pinned_buffer, static_cast(num_bytes), options); +} + +bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor) +{ + auto addr = locked_tensor.data_ptr(); + if (_locked_tensors.find(addr) != _locked_tensors.end()) { + munlock(addr, _locked_tensors[addr]); + _locked_tensors.erase(addr); + return true; + } + + return false; +} diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.h b/csrc/aio/py_lib/deepspeed_pin_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..a421bbc8f3b1ac155348a48387f40601dfe7c429 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.h @@ -0,0 +1,24 @@ +/* +Copyright 2023 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for managing CPU tensors occupying page-locked memory. +TODO: Implement a full-featured manager that + 1. Avoid page-locked memory leaks + 2. 
Minimize page-locked memory usage by reducing internal fragmentation +*/ + +#include +#include "deepspeed_py_aio.h" + +struct deepspeed_pin_tensor_t { + std::map _locked_tensors; + + deepspeed_pin_tensor_t() = default; + + ~deepspeed_pin_tensor_t(); + + torch::Tensor alloc(const size_t num_elem, const at::ScalarType& elem_type); + + bool free(torch::Tensor& locked_tensor); +}; diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index 417319f8ae5ce3bead644c80c094d9df1061879a..cb81924ec7d354b4168a863f38a555a3ed139d3b 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -22,7 +22,8 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, _overlap_events(overlap_events), _num_threads(num_threads), _aio_config(block_size, queue_depth, single_submit, overlap_events, false), - _num_pending_ops(0) + _num_pending_ops(0), + _pinned_tensor_mgr(new deepspeed_pin_tensor_t()) { for (auto i = 0; i < num_threads; ++i) { _thread_contexts.push_back(std::make_shared(i, _aio_config)); @@ -280,3 +281,14 @@ int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char { return pwrite(buffer, filename, false, true); } + +at::Tensor deepspeed_aio_handle_t::new_cpu_locked_tensor(const size_t num_elem, + const torch::Tensor& example_tensor) +{ + return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type()); +} + +bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor) +{ + return _pinned_tensor_mgr->free(locked_tensor); +} diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index 22de4c3961d29abc94517b81ff38b7224822589c..2163aafcfe89b1728ee7d8ed1421a56d648ccf8a 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -8,6 +8,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
#include #include #include "deepspeed_aio_thread.h" +#include "deepspeed_pin_tensor.h" struct deepspeed_aio_handle_t { std::unique_ptr _aio_ctxt; @@ -19,6 +20,7 @@ struct deepspeed_aio_handle_t { std::vector> _thread_contexts; std::vector _threads; int _num_pending_ops; + std::unique_ptr _pinned_tensor_mgr; deepspeed_aio_handle_t(const int block_size, const int queue_depth, @@ -56,6 +58,11 @@ struct deepspeed_aio_handle_t { int async_pwrite(const torch::Tensor& buffer, const char* filename); + // TODO: Make API's args to be shape and dtype. + torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor); + + bool free_cpu_locked_tensor(torch::Tensor&); + int wait(); void _stop_threads(); diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp old mode 100644 new mode 100755 index 68590581ce2d985bc5209a73d9de4f515c987c30..3c971c667874568fa1547395a73143431f9c72f8 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -37,5 +37,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) .def("async_pread", &deepspeed_aio_handle_t::async_pread) .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite) + .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor) + .def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor) + .def("wait", &deepspeed_aio_handle_t::wait); } diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py index be6cd74f7ac6fff899a56a80d9e3969013243c12..eebea69b1bbf3963295eaf7429905cd9546011d1 100644 --- a/csrc/aio/py_test/aio_bench_perf_sweep.py +++ b/csrc/aio/py_test/aio_bench_perf_sweep.py @@ -15,6 +15,7 @@ import shutil from test_ds_aio_utils import refine_integer_value from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \ READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR +from deepspeed.ops.op_builder import AsyncIOBuilder OTHER_OPTIONS = '--handle' PERF_SCRIPT = 'test_ds_aio.py' 
@@ -277,8 +278,6 @@ def script_path(): def async_io_setup(): - import deepspeed - from deepspeed.ops.aio import AsyncIOBuilder return AsyncIOBuilder().is_compatible() diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py old mode 100644 new mode 100755 index cf70b6655e9c1366371d24a6fb33808c41729e93..d7f034ad9c463c9a4ef851a6160784f705d7ff3a --- a/csrc/aio/py_test/ds_aio_basic.py +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -8,9 +8,10 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. import torch import os import time -from deepspeed.ops.aio import AsyncIOBuilder from multiprocessing import Pool, Barrier from test_ds_aio_utils import report_results, task_log, task_barrier +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import AsyncIOBuilder def pre_basic(args, tid, read_op): @@ -19,7 +20,10 @@ def pre_basic(args, tid, read_op): file = args.read_file if read_op else f'{args.write_file}.{tid}' task_log(tid, f'Allocate tensor of size {num_bytes} bytes') - buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + buffer = get_accelerator().pin_memory( + torch.empty(num_bytes, + dtype=torch.uint8, + device='cpu')) task_log( tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py old mode 100644 new mode 100755 index 947ee2e6cb633e52c33c4b0ce06c56ad75b73f4c..7f0e44779cb15375978dd5fd1619fb162996b0d4 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -9,8 +9,9 @@ import torch import os import time from multiprocessing import Pool, Barrier -from deepspeed.ops.aio import AsyncIOBuilder from test_ds_aio_utils import report_results, task_log, task_barrier +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import AsyncIOBuilder def pre_handle(args, tid, read_op): @@ -18,23 +19,30 @@ def 
pre_handle(args, tid, read_op): num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size file = args.read_file if read_op else f'{args.write_file}.{tid}' - task_log(tid, f'Allocate tensor of size {num_bytes} bytes') - if args.gpu: - buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda') - else: - buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() - task_log( - tid, - f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' - ) - io_parallel = args.io_parallel if args.io_parallel else 1 handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, args.overlap_events, io_parallel) - task_log(tid, f'created deepspeed aio handle') + task_log(tid, f'Created deepspeed aio handle') + + if args.gpu: + buffer = torch.empty(num_bytes, + dtype=torch.uint8, + device=get_accelerator().device_name()) + else: + if args.use_accelerator_pin_memory: + buffer = get_accelerator().pin_memory( + torch.empty(num_bytes, + dtype=torch.uint8, + device='cpu')) + else: + buffer = handle.new_cpu_locked_tensor(num_bytes, + torch.empty(0, + dtype=torch.uint8)) + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') ctxt = {} ctxt['file'] = file @@ -43,6 +51,11 @@ def pre_handle(args, tid, read_op): ctxt['buffer'] = buffer ctxt['elapsed_sec'] = 0 + task_log( + tid, + f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' + ) + return ctxt diff --git a/csrc/aio/py_test/parse_aio_stats.py b/csrc/aio/py_test/parse_aio_stats.py old mode 100644 new mode 100755 index 1921973e4f735ffbe0cc0d67b0f970e4c15a47ab..2a3e64944bef5badd040c7253dcd36927a7be3db --- a/csrc/aio/py_test/parse_aio_stats.py +++ b/csrc/aio/py_test/parse_aio_stats.py @@ -7,7 +7,6 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
import os import argparse -import re READ_SPEED = 'read_speed' WRITE_SPEED = 'write_speed' diff --git a/csrc/aio/py_test/perf_sweep_utils.py b/csrc/aio/py_test/perf_sweep_utils.py index 2fd1a4c55d0fd90090edb223c0303f90be5a0db9..78dd93b0a175762043e57b5773ca2dd3fa849687 100644 --- a/csrc/aio/py_test/perf_sweep_utils.py +++ b/csrc/aio/py_test/perf_sweep_utils.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + SCRIPT_PREFIX = '_aio_bench' WRITE_OP_DESC = 'write' READ_OP_DESC = 'read' diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh old mode 100644 new mode 100755 diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh old mode 100644 new mode 100755 diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py old mode 100644 new mode 100755 index f97d3e676c03c13d54c54b6cc23e24745b09f335..7cb737d689e1b40d6d0ab099fbbfdf69bb4f207f --- a/csrc/aio/py_test/test_ds_aio.py +++ b/csrc/aio/py_test/test_ds_aio.py @@ -6,11 +6,7 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
""" import os -import torch import argparse -import time -import sys -from multiprocessing import Pool import multiprocessing as mp from ds_aio_basic import aio_basic_multiprocessing from ds_aio_handle import aio_handle_multiprocessing @@ -67,6 +63,10 @@ def parse_arguments(): parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + parser.add_argument('--use_accelerator_pin_memory', + action='store_true', + help='Obtain pinned (CPU page-locked) tensors from accelerator') + args = parser.parse_args() print(f'args = {args}') return args diff --git a/csrc/aio/py_test/test_ds_aio_utils.py b/csrc/aio/py_test/test_ds_aio_utils.py old mode 100644 new mode 100755 index c68dfdddc23343c5d3c0a623a4be33f11f78e628..a330e4cd1980917fe3e3d203db68ffd0807e0212 --- a/csrc/aio/py_test/test_ds_aio_utils.py +++ b/csrc/aio/py_test/test_ds_aio_utils.py @@ -5,8 +5,6 @@ Licensed under the MIT license. Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ -import os - BYTES_PER_GB = 1024**3 LOG_TIDS = [0] diff --git a/csrc/aio/py_test/validate_async_io.py b/csrc/aio/py_test/validate_async_io.py index ceae84c840da9ba8902310170e1c7afe782a63ce..3b96c3c2ffba38a36e455eeb1d42f656288bc458 100644 --- a/csrc/aio/py_test/validate_async_io.py +++ b/csrc/aio/py_test/validate_async_io.py @@ -4,6 +4,5 @@ Licensed under the MIT license. Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
""" -import deepspeed -from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.ops.op_builder import AsyncIOBuilder assert AsyncIOBuilder().is_compatible() diff --git a/csrc/common/custom_cuda_kernel.cu b/csrc/common/custom_cuda_kernel.cu index f7a2b5d480dffae5f44b4558302c2c96240444ba..4c56a2562bc344417667602d63bd4af3dba63179 100644 --- a/csrc/common/custom_cuda_kernel.cu +++ b/csrc/common/custom_cuda_kernel.cu @@ -1,5 +1,11 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ +#ifdef __HIPCC__ +#include "custom_hip_layers.h" +#else #include "custom_cuda_layers.h" - +#endif __global__ void param_update_kernel(const float* input, __half* output, int size) { int id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/csrc/includes/StopWatch.h b/csrc/includes/StopWatch.h index 9bf0401ebc78ffbe37c3b40d2466401731358051..a5576047c96716a373913c73d864aa91c3abc75c 100644 --- a/csrc/includes/StopWatch.h +++ b/csrc/includes/StopWatch.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #ifdef _WIN32 #include diff --git a/csrc/includes/Timer.h b/csrc/includes/Timer.h index efc7fff84abb86b91473d1a532c78bf16e387384..d325cae1c5310e492318f1a709d21d37cacb2289 100644 --- a/csrc/includes/Timer.h +++ b/csrc/includes/Timer.h @@ -1,3 +1,6 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ #ifndef __TIMER_H__ #define __TIMER_H__ diff --git a/csrc/includes/context.h b/csrc/includes/context.h index 5f0424116546f3080c760338d4e02e40ae63be59..a8968ba1f0b479481c0aea2e88d58ff805972513 100644 --- a/csrc/includes/context.h +++ b/csrc/includes/context.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/csrc/includes/conversion_utils.h b/csrc/includes/conversion_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f033e357f3b63e6e531fc6525fe110938fe2bdc3 --- /dev/null +++ b/csrc/includes/conversion_utils.h @@ -0,0 +1,626 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#pragma once + 
+#include "ds_kernel_utils.h" + +#include +#include + +#ifdef BF16_AVAILABLE +#include +#endif + +namespace conversion { + +// Basic primitive for constructing conversions +template +DS_D_INLINE TO to(FROM val) +{ + return to(val); +} + +// Specializations + +/********************* Identity Conversions *********************/ +/* +Identity conversions are useful in templated functions where we might have +a fixed destination type. For example, I might have a kernel that accepts +__half, __nv_bfloat16, and float but always want to do the core computation +at floating point: + +T mem_value = input[idx]; +float compute_value = conversion::to(mem_value); + +In practice, we should be able to elide the second template parameter: +float compute_val = conversion::to(mem_value); + +In this case, we need an implementation to handle the T = float case + +NOTE: The type inferencing system appears to be unable to handle inferring the first +template parameter, even in the trivial case. +*/ + +// Floating point types +template <> +DS_D_INLINE double to(double val) +{ + return val; +} +template <> +DS_D_INLINE float to(float val) +{ + return val; +} +template <> +DS_D_INLINE __half to(__half val) +{ + return val; +} +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE __nv_bfloat16 to(__nv_bfloat16 val) +{ + return val; +} +#endif + +// Integer types +template <> +DS_D_INLINE int8_t to(int8_t val) +{ + return val; +} +template <> +DS_D_INLINE uint8_t to(uint8_t val) +{ + return val; +} +template <> +DS_D_INLINE int16_t to(int16_t val) +{ + return val; +} +template <> +DS_D_INLINE uint16_t to(uint16_t val) +{ + return val; +} +template <> +DS_D_INLINE int32_t to(int32_t val) +{ + return val; +} +template <> +DS_D_INLINE uint32_t to(uint32_t val) +{ + return val; +} +template <> +DS_D_INLINE int64_t to(int64_t val) +{ + return val; +} +template <> +DS_D_INLINE uint64_t to(uint64_t val) +{ + return val; +} + +// TODO: evaluate if we want bools + +/********************* To Double 
Conversions *********************/ + +// * to double variants + +// Would normally like to not use C cast, but this is an important enough conversion +// to keep +template <> +DS_D_INLINE double to(float val) +{ +#ifdef PTX_AVAILABLE + double ret_val; + asm("cvt.rn.f64.f32 %0, %1;\n" : "=d"(ret_val) : "f"(val)); + return ret_val; +#else + return double(val); +#endif +} +// Note: there is a CVT instruction for __half -> double, but there's no inline interface +// for passing a single half value +template <> +DS_D_INLINE double to(__half val) +{ + return to(__half2float(val)); +} +template <> +DS_D_INLINE double to(int64_t val) +{ + return __ll2double_rn(val); +} +template <> +DS_D_INLINE double to(int32_t val) +{ + return __int2double_rn(val); +} +template <> +DS_D_INLINE double to(int16_t val) +{ + return __int2double_rn(val); +} +template <> +DS_D_INLINE double to(int8_t val) +{ + return __int2double_rn(val); +} +template <> +DS_D_INLINE double to(uint64_t val) +{ + return __ull2double_rn(val); +} +template <> +DS_D_INLINE double to(uint32_t val) +{ + return __uint2double_rn(val); +} +template <> +DS_D_INLINE double to(uint16_t val) +{ + return __uint2double_rn(val); +} +template <> +DS_D_INLINE double to(uint8_t val) +{ + return __uint2double_rn(val); +} + +// Same applies here +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE double to(__nv_bfloat16 val) +{ + return to(__bfloat162float(val)); +} +#endif + +/********************* To Float Conversions *********************/ + +template <> +DS_D_INLINE float to(double val) +{ + return __double2float_rn(val); +} +template <> +DS_D_INLINE float to(__half val) +{ + return __half2float(val); +} +template <> +DS_D_INLINE float to(int64_t val) +{ + return __ll2float_rn(val); +} +template <> +DS_D_INLINE float to(int32_t val) +{ + return __int2float_rn(val); +} +template <> +DS_D_INLINE float to(int16_t val) +{ + return __int2float_rn(val); +} +template <> +DS_D_INLINE float to(int8_t val) +{ + return __int2float_rn(val); 
+} +template <> +DS_D_INLINE float to(uint64_t val) +{ + return __ull2float_rn(val); +} +template <> +DS_D_INLINE float to(uint32_t val) +{ + return __uint2float_rn(val); +} +template <> +DS_D_INLINE float to(uint16_t val) +{ + return __uint2float_rn(val); +} +template <> +DS_D_INLINE float to(uint8_t val) +{ + return __uint2float_rn(val); +} + +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE float to(__nv_bfloat16 val) +{ + return __bfloat162float(val); +} +#endif + +/********************* To Float2 Conversions *********************/ +template <> +DS_D_INLINE float2 to(__half2 val) +{ + return __half22float2(val); +} + +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE float2 to(__nv_bfloat162 val) +{ + return __bfloat1622float2(val); +} +#endif + +/********************* To Half Conversions *********************/ +//aiss +//template <> +//DS_D_INLINE __half to(double val) +//{ +// return __double2half(val); +//} +template <> +DS_D_INLINE __half to(float val) +{ + return __float2half(val); +} +template <> +DS_D_INLINE __half to(int64_t val) +{ + return __ll2half_rn(val); +} +template <> +DS_D_INLINE __half to(int32_t val) +{ + return __int2half_rn(val); +} +template <> +DS_D_INLINE __half to(int16_t val) +{ + return __short2half_rn(val); +} +template <> +DS_D_INLINE __half to(int8_t val) +{ + return __int2half_rn(val); +} +template <> +DS_D_INLINE __half to(uint64_t val) +{ + return __ull2half_rn(val); +} +template <> +DS_D_INLINE __half to(uint32_t val) +{ + return __uint2half_rn(val); +} +template <> +DS_D_INLINE __half to(uint16_t val) +{ + return __ushort2half_rn(val); +} +template <> +DS_D_INLINE __half to(uint8_t val) +{ + return __uint2half_rn(val); +} + +#ifdef BF16_AVAILABLE +// No direct conversion +template <> +DS_D_INLINE __half to(__nv_bfloat16 val) +{ + return to<__half>(to(val)); +} +#endif + +/********************* To Half2 Conversions *********************/ +template <> +DS_D_INLINE __half2 to(float2 val) +{ + return __float22half2_rn(val); +} + 
+#ifdef BF16_AVAILABLE +// No direct conversion +template <> +DS_D_INLINE __half2 to(__nv_bfloat162 val) +{ + return to<__half2>(to(val)); +} +#endif + +/********************* To BF16 Conversions *********************/ +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE __nv_bfloat16 to(double val) +{ + return __double2bfloat16(val); +} +template <> +DS_D_INLINE __nv_bfloat16 to(float val) +{ + return __float2bfloat16(val); +} +template <> +DS_D_INLINE __nv_bfloat16 to(int64_t val) +{ + return __ll2bfloat16_rn(val); +} +template <> +DS_D_INLINE __nv_bfloat16 to(int32_t val) +{ + return __int2bfloat16_rn(val); +} +template <> +DS_D_INLINE __nv_bfloat16 to(int16_t val) +{ + return __short2bfloat16_rn(val); +} +template <> +DS_D_INLINE __nv_bfloat16 to(int8_t val) +{ + return __int2bfloat16_rn(val); +} +template <> +DS_D_INLINE __nv_bfloat16 to(uint64_t val) +{ + return __ull2bfloat16_rn(val); +} +template <> +DS_D_INLINE __nv_bfloat16 to(uint32_t val) +{ + return __uint2bfloat16_rn(val); +} +template <> +DS_D_INLINE __nv_bfloat16 to(uint16_t val) +{ + return __ushort2bfloat16_rn(val); +} +template <> +DS_D_INLINE __nv_bfloat16 to(uint8_t val) +{ + return __uint2bfloat16_rn(val); +} +#endif + +/********************* To BF162 Conversions *********************/ +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE __nv_bfloat162 to(float2 val) +{ + return __float22bfloat162_rn(val); +} +template <> +DS_D_INLINE __nv_bfloat162 to(__half2 val) +{ + return to<__nv_bfloat162>(to(val)); +} +#endif + +/********************* To INT64_T Conversions *********************/ +template <> +DS_D_INLINE int64_t to(double val) +{ + return __double2ll_rn(val); +} +template <> +DS_D_INLINE int64_t to(float val) +{ + return __float2ll_rn(val); +} +template <> +DS_D_INLINE int64_t to(__half val) +{ + return __half2ll_rn(val); +} +// No direct support for integer casts at the C++ level and I don't feel they're so important +// to demand an PTX at this time + +#ifdef BF16_AVAILABLE +template <> 
+DS_D_INLINE int64_t to(__nv_bfloat16 val) +{ + return __bfloat162ll_rn(val); +} +#endif + +/********************* To INT32_T Conversions *********************/ +template <> +DS_D_INLINE int32_t to(double val) +{ + return __double2int_rn(val); +} +template <> +DS_D_INLINE int32_t to(float val) +{ + return __float2int_rn(val); +} +template <> +DS_D_INLINE int32_t to(__half val) +{ + return __half2int_rn(val); +} +// No direct support for integer casts at the C++ level and I don't feel they're so important +// to demand an PTX at this time + +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE int32_t to(__nv_bfloat16 val) +{ + return __bfloat162int_rn(val); +} +#endif + +/********************* To INT16_T Conversions *********************/ +template <> +DS_D_INLINE int16_t to(double val) +{ + return __double2int_rn(val); +} +template <> +DS_D_INLINE int16_t to(float val) +{ + return __float2int_rn(val); +} +template <> +DS_D_INLINE int16_t to(__half val) +{ + return __half2int_rn(val); +} +// No direct support for integer casts at the C++ level and I don't feel they're so important +// to demand an PTX at this time + +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE int16_t to(__nv_bfloat16 val) +{ + return __bfloat162int_rn(val); +} +#endif + +/********************* To INT8_T Conversions *********************/ +template <> +DS_D_INLINE int8_t to(double val) +{ + return __double2int_rn(val); +} +template <> +DS_D_INLINE int8_t to(float val) +{ + return __float2int_rn(val); +} +template <> +DS_D_INLINE int8_t to(__half val) +{ + return __half2int_rn(val); +} +// No direct support for integer casts at the C++ level and I don't feel they're so important +// to demand an PTX at this time + +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE int8_t to(__nv_bfloat16 val) +{ + return __bfloat162int_rn(val); +} +#endif + +/********************* To UINT64_T Conversions *********************/ +template <> +DS_D_INLINE uint64_t to(double val) +{ + return __double2ull_rn(val); +} 
+template <> +DS_D_INLINE uint64_t to(float val) +{ + return __float2ull_rn(val); +} +template <> +DS_D_INLINE uint64_t to(__half val) +{ + return __half2ull_rn(val); +} +// No direct support for integer casts at the C++ level and I don't feel they're so important +// to demand an PTX at this time + +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE uint64_t to(__nv_bfloat16 val) +{ + return __bfloat162ull_rn(val); +} +#endif + +/********************* To UINT32_T Conversions *********************/ +template <> +DS_D_INLINE uint32_t to(double val) +{ + return __double2uint_rn(val); +} +template <> +DS_D_INLINE uint32_t to(float val) +{ + return __float2uint_rn(val); +} +template <> +DS_D_INLINE uint32_t to(__half val) +{ + return __half2uint_rn(val); +} +// No direct support for integer casts at the C++ level and I don't feel they're so important +// to demand an PTX at this time + +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE uint32_t to(__nv_bfloat16 val) +{ + return __bfloat162uint_rn(val); +} +#endif + +/********************* To UINT16_T Conversions *********************/ +template <> +DS_D_INLINE uint16_t to(double val) +{ + return __double2uint_rn(val); +} +template <> +DS_D_INLINE uint16_t to(float val) +{ + return __float2uint_rn(val); +} +template <> +DS_D_INLINE uint16_t to(__half val) +{ + return __half2uint_rn(val); +} +// No direct support for integer casts at the C++ level and I don't feel they're so important +// to demand an PTX at this time + +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE uint16_t to(__nv_bfloat16 val) +{ + return __bfloat162uint_rn(val); +} +#endif + +/********************* To UINT8_T Conversions *********************/ +template <> +DS_D_INLINE uint8_t to(double val) +{ + return __double2uint_rn(val); +} +template <> +DS_D_INLINE uint8_t to(float val) +{ + return __float2uint_rn(val); +} +template <> +DS_D_INLINE uint8_t to(__half val) +{ + return __half2uint_rn(val); +} +// No direct support for integer casts at the C++ level 
and I don't feel they're so important +// to demand an PTX at this time + +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE uint8_t to(__nv_bfloat16 val) +{ + return __bfloat162uint_rn(val); +} +#endif + +} // namespace conversion diff --git a/csrc/includes/cpu_adagrad.h b/csrc/includes/cpu_adagrad.h index 6c21b7c8e82d36ae62c11d3cc8dfbc153af19549..0dda4f759ece406d905d07ea1f879891122285c3 100644 --- a/csrc/includes/cpu_adagrad.h +++ b/csrc/includes/cpu_adagrad.h @@ -1,39 +1,54 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #define NOMINMAX // Windows idiosyncrasy // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c -#include -#include #include #include +#include "simd.h" + +#if defined(__ENABLE_CUDA__) +#include +#include #include "cuda.h" #include "custom_cuda_layers.h" -#include "simd.h" +typedef __half ds_half_precision_t; +#else +typedef unsigned short ds_half_precision_t; +#endif -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - __half* dev_param = nullptr, \ +#define STEP(SPAN) \ + void Step_##SPAN(float* _params, \ + float* grads, \ + float* _exp_avg_sq, \ + size_t _param_size, \ + ds_half_precision_t* dev_param = nullptr, \ bool half_precision = false); class Adagrad_Optimizer { public: Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0) - : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false) + : _alpha(alpha), _eps(eps), _weight_decay(weight_decay) { +#if defined(__ENABLE_CUDA__) cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); _streams[0] = Context::Instance().GetCurrentStream(); _streams[1] = Context::Instance().GetNewStream(); + _buf_index = false; +#endif } ~Adagrad_Optimizer() { +#if defined(__ENABLE_CUDA__) cudaFreeHost(_doubled_buffer[0]); cudaFreeHost(_doubled_buffer[1]); +#endif } #if 
defined(__AVX512__) or defined(__AVX256__) template @@ -42,16 +57,18 @@ public: float* grads, float* _exp_avg_sq, size_t param_size, - __half* dev_param = nullptr, + ds_half_precision_t* dev_param = nullptr, bool half_precision = false); #endif STEP(1) STEP(4) STEP(8) +#if defined(__ENABLE_CUDA__) inline void SynchronizeStreams() { for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); } +#endif inline void IncrementStep(size_t step) { _step++; @@ -73,10 +90,11 @@ private: float _betta2_t; size_t _step; - float* _doubled_buffer[2]; +#if defined(__ENABLE_CUDA__) bool _buf_index; - + float* _doubled_buffer[2]; cudaStream_t _streams[2]; +#endif }; #if defined(__AVX512__) or defined(__AVX256__) @@ -86,7 +104,7 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, float* grads, float* _exp_avg_sq, size_t _param_size, - __half* dev_params, + ds_half_precision_t* dev_params, bool half_precision) { size_t new_rounded_size = 0; @@ -104,7 +122,9 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, size_t copy_size = TILE; if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; size_t offset = copy_size + t; +#if defined(__ENABLE_CUDA__) if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } +#endif #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { AVX_Data grad_4[span]; @@ -128,12 +148,14 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, simd_fma(param_4, grad_4, step_size_4, param_4); simd_store(_params + i, param_4, half_precision); +#if defined(__ENABLE_CUDA__) if (dev_params) { simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); } +#endif simd_store(_exp_avg_sq + i, variance_4, false); } - +#if defined(__ENABLE_CUDA__) if (dev_params) { if (half_precision) launch_param_update_half( @@ -144,6 +166,7 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, _buf_index = !_buf_index; } +#endif } *rounded_size = new_rounded_size; } diff --git 
a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index 09677c6842dee6a4a9abe835c245864f07739aa9..e9e139aa849273666c2fce5d668cf0dc938d162a 100644 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -1,23 +1,34 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #define NOMINMAX // Windows idiosyncrasy // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c -#include -#include #include #include +#include "simd.h" + +#if defined(__ENABLE_CUDA__) +#include +#include #include "cuda.h" #include "custom_cuda_layers.h" -#include "simd.h" +typedef __half ds_half_precision_t; +#else +#include +typedef unsigned short ds_half_precision_t; +#endif -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - __half* dev_param = nullptr, \ +#define STEP(SPAN) \ + void Step_##SPAN(float* _params, \ + float* grads, \ + float* _exp_avg, \ + float* _exp_avg_sq, \ + size_t _param_size, \ + ds_half_precision_t* dev_param = nullptr, \ bool half_precision = false); class Adam_Optimizer { @@ -36,20 +47,25 @@ public: _betta1_t(1.0), _betta2_t(1.0), _step(0), - _buf_index(false), _adamw_mode(adamw_mode) { +#if defined(__ENABLE_CUDA__) cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); _streams[0] = Context::Instance().GetCurrentStream(); _streams[1] = Context::Instance().GetNewStream(); + _buf_index = false; +#endif } ~Adam_Optimizer() { +#if defined(__ENABLE_CUDA__) cudaFreeHost(_doubled_buffer[0]); cudaFreeHost(_doubled_buffer[1]); +#endif } + #if defined(__AVX512__) or defined(__AVX256__) template void Step_AVX(size_t* rounded_size, @@ -58,16 +74,18 @@ public: float* _exp_avg, float* _exp_avg_sq, size_t param_size, - __half* dev_param = nullptr, + ds_half_precision_t* dev_param = nullptr, bool half_precision = false); #endif STEP(1) STEP(4) STEP(8) +#if 
defined(__ENABLE_CUDA__) inline void SynchronizeStreams() { for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); } +#endif inline void IncrementStep(size_t step, float beta1, float beta2) { if (beta1 != _betta1 || beta2 != _betta2) { @@ -116,11 +134,13 @@ private: float _bias_correction1; float _bias_correction2; - float* _doubled_buffer[2]; - bool _buf_index; bool _adamw_mode; +#if defined(__ENABLE_CUDA__) + float* _doubled_buffer[2]; cudaStream_t _streams[2]; + bool _buf_index; +#endif }; #if defined(__AVX512__) or defined(__AVX256__) @@ -131,10 +151,11 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, float* _exp_avg, float* _exp_avg_sq, size_t _param_size, - __half* dev_params, + ds_half_precision_t* dev_params, bool half_precision) { size_t new_rounded_size = 0; + int rshft = half_precision ? 1 : 0; AVX_Data betta1_4; betta1_4.data = SIMD_SET(_betta1); @@ -167,11 +188,13 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, size_t copy_size = TILE; if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; size_t offset = copy_size + t; +#if defined(__ENABLE_CUDA__) if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } +#endif #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { AVX_Data grad_4[span]; - simd_load(grad_4, grads + i, half_precision); + simd_load(grad_4, grads + (i >> rshft), half_precision); AVX_Data momentum_4[span]; simd_load(momentum_4, _exp_avg + i, false); @@ -180,7 +203,7 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, simd_load(variance_4, _exp_avg_sq + i, false); AVX_Data param_4[span]; - simd_load(param_4, _params + i, half_precision); + simd_load(param_4, _params + (i >> rshft), half_precision); if (_weight_decay > 0 && !_adamw_mode) { simd_fma(grad_4, param_4, weight_decay4, grad_4); @@ -201,14 +224,16 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, simd_fma(param_4, grad_4, step_size_4, param_4); - simd_store(_params + i, param_4, half_precision); + 
simd_store(_params + (i >> rshft), param_4, half_precision); +#if defined(__ENABLE_CUDA__) if (dev_params) { simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); } +#endif simd_store(_exp_avg + i, momentum_4, false); simd_store(_exp_avg_sq + i, variance_4, false); } - +#if defined(__ENABLE_CUDA__) if (dev_params) { if (half_precision) launch_param_update_half( @@ -219,6 +244,7 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, _buf_index = !_buf_index; } +#endif } *rounded_size = new_rounded_size; } diff --git a/csrc/includes/cublas_wrappers.h b/csrc/includes/cublas_wrappers.h index 9bb6cc30f6ae5c00013e75051118e2618a18312c..68e7f48e5083ba3990e0e0ab0121addf142e52be 100644 --- a/csrc/includes/cublas_wrappers.h +++ b/csrc/includes/cublas_wrappers.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h index 30c633f72914f97fc0aa9f55f86cf963e7b49eea..7ac04154dc57396986c75e3f201023529732a9f8 100644 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -1,21 +1,17 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + #pragma once +#include "ds_kernel_utils.h" + #include #include +#include #include #include -#ifdef __HIP_PLATFORM_HCC__ -#define HALF_PRECISION_AVAILABLE = 1 -#include -#else -#if __CUDA_ARCH__ >= 700 -#define HALF_PRECISION_AVAILABLE = 1 -#endif -#include -#endif -#include - #include "context.h" #include "cublas_wrappers.h" @@ -45,30 +41,6 @@ #define WARP_SIZE_BITS 5 -template -void launch_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template -void launch_sr_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template -void launch_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template -void 
launch_sr_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); // Fused bias add with gelu activation template void launch_bias_gelu(const T* input, @@ -301,3 +273,54 @@ void launch_fuse_transpose_bias_kernel(const T* inp, void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream); void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream); + +void launch_token_sort(int32_t* indices, + int layers, + int batch_size, + int reserved_size, + int original_tokens, + cudaStream_t stream); + +template +void launch_gather_tokens(T* retained_tokens, + T* activations, + int32_t* gather_indices, + int32_t batch_size, + int32_t sampled_tokens, + int32_t channels, + int32_t read_batch_stride, + int32_t read_seq_stride, + int32_t write_batch_stride, + int32_t write_seq_stride, + cudaStream_t stream); + +template +void launch_scatter_tokens(T* all_activations, + T* layer_activations, + int32_t* gather_indices, + int32_t batch_size, + int32_t sampled_tokens, + int32_t channels, + int32_t read_batch_stride, + int32_t read_seq_stride, + int32_t write_batch_stride, + int32_t write_seq_stride, + cudaStream_t stream); + +template +void launch_slice_gpt_mask(T* output_mask, + const T* input_mask, + int batch_size, + int truncated_seq_len, + int orig_seq_len, + cudaStream_t stream); + +template +void launch_slice_bert_mask(T* output_mask, + const T* input_mask, + const int32_t* retained_indices, + int32_t layers, + int32_t batch_size, + int32_t truncated_seq_len, + int32_t orig_seq_len, + cudaStream_t stream); diff --git a/csrc/includes/dequantization_utils.h b/csrc/includes/dequantization_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fea7505c8a13041c80b778be3434b1531fe2f6b5 --- /dev/null +++ b/csrc/includes/dequantization_utils.h @@ -0,0 +1,176 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "conversion_utils.h" 
+#include "ds_kernel_utils.h" +#include "quantization.h" +#include "quantization_utils.h" + +namespace cg = cooperative_groups; + +#pragma once + +namespace dequantize { +using Type = quantize::Type; + +template +using Params = quantize::Params; + +constexpr int granularity = quantize::granularity; +using PackedInt4 = quantize::PackedInt4; + +constexpr int h_per_chunk = granularity / sizeof(__half); +constexpr int h2_per_chunk = granularity / sizeof(__half2); + +/* +Device function that reads quantized data from global memory, dequantizes +it, and stores it to global memory. +Template Arguments : + numBits - Number of bits in quantized element. int: 4, 8 + qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric + unroll - Number of load steps to internally unroll int + threads - Number of threads to perform dequant int +Function arguments: + global_output - __half pointer in global memory + data - Quantized data in global memory + global_params - Quantization parameters in global memory + elems_per_group - Number of elements in each quantization group + total_elems - Tensor size (note, does not need to be multiple of elems_per_group) +*/ +template +DS_D_INLINE void to_global(__half* global_output, + const int8_t* data, + const float* global_params, + const int elems_per_group, + const int total_elems); + +/* +Device function that quantizes 16 bytes of __half type input data. +Template Arguments : + numBits - Number of bits in quantized element. int : 8 or 4 + qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric +Function Arguments : + local_output - Local array to store dequantized data __half* or __half2* + data - Pointer to quantized input data. int8_t* + Params - Parameters for quantization. 
Params +*/ +template +DS_D_INLINE void chunk(__half2* local_output, const int8_t* data, Params q_params); + +template +DS_D_INLINE void chunk(T* local_output, const int8_t* data, Params q_params); + +/**************** Implementations ******************/ + +template +DS_D_INLINE void chunk(T* local_output, const int8_t* data, Params q_params) +{ + constexpr int32_t num_elems_packed = 8 / numBits; + constexpr int32_t iters = h_per_chunk / num_elems_packed; + +#pragma unroll + for (int i = 0; i < iters; i++) { + if constexpr (num_elems_packed == 1) { + local_output[i] = q_params.template dequantize(data[i]); + } else { + auto accessible_data = *(PackedInt4*)(&data[i]); + local_output[2 * i] = q_params.template dequantize(accessible_data.low); + local_output[2 * i + 1] = q_params.template dequantize(accessible_data.high); + } + } +} + +template +DS_D_INLINE void chunk(__half2* local_output, const int8_t* data, Params q_params) +{ + __half* local_output_cast = reinterpret_cast<__half*>(local_output); + chunk<__half, numBits>(local_output_cast, data, q_params); +} + +template +DS_D_INLINE void _to_global(T* global_output, + const int8_t* data, + const float* global_params, + const int elems_per_group, + const int total_elems) +{ + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + // Load constants + // TODO(cmikeh2): Refactor into functions? + constexpr int load_granularity = (granularity / (sizeof(T))) / (numBits == 8 ? 1 : 2); + constexpr int load_step_stride = load_granularity * threads; + constexpr int load_block_stride = load_step_stride * unroll; + + // Store constants + constexpr int T_per_chunk = granularity / sizeof(T); + constexpr int store_step_stride = T_per_chunk * threads; + constexpr int store_block_stride = store_step_stride * unroll; + + // Load offsets + const int load_block_offset = tb.group_index().x * load_block_stride; + // Note: we can use `load_granularity` since the dtype is `int8_t`. 
+ const int load_thread_offset = tb.thread_index().x * load_granularity; + const int8_t* load_base = data + load_block_offset + load_thread_offset; + + // Store offsets + const int store_block_offset = tb.group_index().x * store_block_stride; + const int store_thread_offset = tb.thread_index().x * T_per_chunk; + const int elem_id_base = store_block_offset + store_thread_offset; + + int8_t local_load_buffer[load_granularity * unroll]; + T local_dequant_buffer[T_per_chunk * unroll]; + + /* + Note: Splitting this loop in half gave about 3-5% performance increase for reasons that aren't + totally clear to me, so this is a deliberately weird code structure. + */ +#pragma unroll + for (int i = 0; i < unroll; i++) { + const int elem_id_iter = elem_id_base + i * store_step_stride; + + if (elem_id_iter < total_elems) { + mem_access::load_global(local_load_buffer + i * load_granularity, + load_base + i * load_step_stride); + } + } + +#pragma unroll + for (int i = 0; i < unroll; i++) { + const int elem_id_iter = elem_id_base + i * store_step_stride; + if (elem_id_iter < total_elems) { + // TODO(cmikeh2): Can we amortize this division? Perform once on the first iteration and + // use indexing math to do division free interpolation of the successive groups? 
+ const int group_index = elem_id_iter / elems_per_group; + Params q_params(global_params, group_index); + + chunk(local_dequant_buffer + i * T_per_chunk, + local_load_buffer + i * load_granularity, + q_params); + mem_access::store_global(global_output + elem_id_iter, + local_dequant_buffer + i * T_per_chunk); + } + } +} + +template +DS_D_INLINE void to_global(T* global_output, + const int8_t* data, + const float* global_params, + const int elems_per_group, + const int total_elems) +{ + if constexpr (numBits == 4 || numBits == 8) { + _to_global( + global_output, data, global_params, elems_per_group, total_elems); + } else if constexpr (numBits == 3) { + // TODO(cmikeh2): Need this implementation + assert(false); + } else { + assert(false); + } +} + +} // namespace dequantize diff --git a/csrc/includes/dropout.h b/csrc/includes/dropout.h index a72572d0876e4ed30cb98270654fed8c201db6f0..2fdf224952954a23548db7ddb4c49119626e8f83 100644 --- a/csrc/includes/dropout.h +++ b/csrc/includes/dropout.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/csrc/includes/ds_kernel_utils.h b/csrc/includes/ds_kernel_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..10210663fcfb881af18ccdf9aa6c005f8fa6221f --- /dev/null +++ b/csrc/includes/ds_kernel_utils.h @@ -0,0 +1,49 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team + +Centralized header file for preprocessor macros and constants +used throughout the codebase. 
+*/ + +#pragma once + +#include + +#define DS_HD_INLINE __host__ __device__ __forceinline__ +#define DS_D_INLINE __device__ __forceinline__ + +#ifdef __HIP_PLATFORM_HCC__ + +// constexpr variant of warpSize for templating +constexpr int hw_warp_size = 64; +#define HALF_PRECISION_AVAILABLE = 1 +#include + +#else // !__HIP_PLATFORM_HCC__ + +// constexpr variant of warpSize for templating +constexpr int hw_warp_size = 32; + +#if __CUDA_ARCH__ >= 530 +#define HALF_PRECISION_AVAILABLE = 1 +#define PTX_AVAILABLE +#endif // __CUDA_ARCH__ >= 530 + +#if __CUDA_ARCH__ >= 800 +#define ASYNC_COPY_AVAILABLE +#define BF16_AVAILABLE +#endif // __CUDA_ARCH__ >= 800 + +#include + +#endif //__HIP_PLATFORM_HCC__ + +inline int next_pow2(const int val) +{ + int rounded_val = val - 1; + rounded_val |= rounded_val >> 1; + rounded_val |= rounded_val >> 2; + rounded_val |= rounded_val >> 4; + rounded_val |= rounded_val >> 8; + return rounded_val + 1; +} diff --git a/csrc/includes/ds_transformer_cuda.h b/csrc/includes/ds_transformer_cuda.h old mode 100644 new mode 100755 index 09afeb9d4b1950f0fc01cad7e7963359b485970c..d289a24c6b4cd37c83a0bbb192f3e437c77e3d6e --- a/csrc/includes/ds_transformer_cuda.h +++ b/csrc/includes/ds_transformer_cuda.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/csrc/includes/feed_forward.h b/csrc/includes/feed_forward.h index de7a9cf1bf9eaf686f387e4dd1b3a45b02f28e85..0f2ece4aabd30caa7829edd5231abd330902ae78 100644 --- a/csrc/includes/feed_forward.h +++ b/csrc/includes/feed_forward.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #ifndef __FEEDFORWARD_H__ #define __FEEDFORWARD_H__ diff --git a/csrc/includes/gelu.h b/csrc/includes/gelu.h index 560f4140ed61e9455b78911da0a44c8944ce53ed..07d431484482cc1a3450d929c2e58d07f4b1aed3 100644 --- a/csrc/includes/gelu.h +++ b/csrc/includes/gelu.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git 
a/csrc/includes/gemm_test.h b/csrc/includes/gemm_test.h index 22c35123f2c776e2e87d53310c316497e55d214d..7ddb8b122798fa66bc2709ffdea7393a4ec796cd 100644 --- a/csrc/includes/gemm_test.h +++ b/csrc/includes/gemm_test.h @@ -1,3 +1,6 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ #pragma once diff --git a/csrc/includes/general_kernels.h b/csrc/includes/general_kernels.h index e949309483ce8060c86ad1a46ca9264b1f45f810..875df84195ea953e056fbccef8fc6e72e58656a0 100644 --- a/csrc/includes/general_kernels.h +++ b/csrc/includes/general_kernels.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #include #include #include diff --git a/csrc/includes/memory_access_utils.h b/csrc/includes/memory_access_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..e2cdcb6ca8312fe9a0a4db99fe453bd4f341dda2 --- /dev/null +++ b/csrc/includes/memory_access_utils.h @@ -0,0 +1,1114 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#pragma once + +#include +#include "ds_kernel_utils.h" + +/////////////////////////////// Memory Access Utils /////////////////////////////// +namespace mem_access { + +enum class LoadPolicy { + CacheAll, // Cache at all levels + CacheGlobal, // Cache at L2 only + CacheStreaming // Cache with evict first policy +}; + +enum class StorePolicy { + Writeback, // Cache in L1, write-back on eviction + CacheGlobal, // Bypass L1, write-back on eviction + CacheStreaming // Allocate cache line with evict first policy +}; + +template +__device__ __forceinline__ void load_global(void* dst, const void* src); + +template +__device__ __forceinline__ void load_global(void* dst, const void* src, bool do_access); + +// Shared accesses have no cache policy +template +__device__ __forceinline__ void load_shared(void* dst, const void* src); + +template +__device__ __forceinline__ void load_shared(void* dst, const void* src, bool do_access); + +template +__device__ __forceinline__ void store_global(void* dst, const void* src); + +// Shared 
accesses have no cache policy +template +__device__ __forceinline__ void store_shared(void* dst, const void* src); + +#ifdef ASYNC_COPY_AVAILABLE +template +__device__ __forceinline__ void memcpy_async(void* shr, const void* gbl); + +template +__device__ __forceinline__ void memcpy_async_nop(void* shr, const void* gbl, bool predicate); + +template +__device__ __forceinline__ void memcpy_async_zero(void* shr, const void* gbl, bool predicate); + +__device__ __forceinline__ void memcpy_async_fence(); + +template +__device__ __forceinline__ void memcpy_async_wait(); + +template +__device__ __forceinline__ void tail_complete_wait(int remaining_stages); +#endif + +// Util for tracking pipeline buffers +// TODO: Evaluate whether this should also be guarded by ASYNC_COPY_AVAILABLE +template +class BufferTracker { +public: + int current_state; + + __device__ __forceinline__ BufferTracker() : current_state(0) {} + + __device__ __forceinline__ int get() + { + int return_val = current_state++; + current_state = (current_state == max ? 
0 : current_state); + return return_val; + } +}; + +__device__ __forceinline__ uint32_t lane_id() +{ +#ifdef PTX_AVAILABLE + unsigned int lane_id; + asm volatile("mov.u32 %0, %%laneid;" : "=r"(lane_id)); + return lane_id; +#else + return threadIdx.x & (warpSize - 1); // Portable +#endif +} + +/////////// Load Global /////////// +template <> +__device__ __forceinline__ void load_global<16>(void* dst, const void* src) +{ + uint4* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.ca.v4.u32 {%0, %1, %2, %3}, [%4];\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w) + : "l"(src)); +#else + const uint4* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<16>(void* dst, const void* src, bool do_access) +{ + uint4* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %5, 0;\n" + "\tmov.b32 %0, 0;\n" + "\tmov.b32 %1, 0;\n" + "\tmov.b32 %2, 0;\n" + "\tmov.b32 %3, 0;\n" + "\t@p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w) + : "l"(src), "r"((int)do_access)); +#else + const uint4* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0].x = 0; + data[0].y = 0; + data[0].z = 0; + data[0].w = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<16, LoadPolicy::CacheGlobal>(void* dst, const void* src) +{ + uint4* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w) + : "l"(src)); +#else + const uint4* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<16, LoadPolicy::CacheGlobal>(void* dst, + const void* src, + bool do_access) +{ + uint4* 
data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %5, 0;\n" + "\tmov.b32 %0, 0;\n" + "\tmov.b32 %1, 0;\n" + "\tmov.b32 %2, 0;\n" + "\tmov.b32 %3, 0;\n" + "\t@p ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w) + : "l"(src), "r"((int)do_access)); +#else + const uint4* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0].x = 0; + data[0].y = 0; + data[0].z = 0; + data[0].w = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<16, LoadPolicy::CacheStreaming>(void* dst, + const void* src) +{ + uint4* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w) + : "l"(src)); +#else + const uint4* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<16, LoadPolicy::CacheStreaming>(void* dst, + const void* src, + bool do_access) +{ + uint4* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %5, 0;\n" + "\tmov.b32 %0, 0;\n" + "\tmov.b32 %1, 0;\n" + "\tmov.b32 %2, 0;\n" + "\tmov.b32 %3, 0;\n" + "\t@p ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w) + : "l"(src), "r"((int)do_access)); +#else + const uint4* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0].x = 0; + data[0].y = 0; + data[0].z = 0; + data[0].w = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<8>(void* dst, const void* src) +{ + uint2* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.ca.v2.u32 {%0, %1}, [%2];\n" + : "=r"(data[0].x), "=r"(data[0].y) + : 
"l"(src)); +#else + const uint2* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<8>(void* dst, const void* src, bool do_access) +{ + uint2* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %3, 0;\n" + "\tmov.b32 %0, 0;\n" + "\tmov.b32 %1, 0;\n" + "\t@p ld.global.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y) + : "l"(src), "r"((int)do_access)); +#else + const uint2* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0].x = 0; + data[0].y = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<8, LoadPolicy::CacheGlobal>(void* dst, const void* src) +{ + uint2* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.cg.v2.u32 {%0, %1}, [%2];\n" + : "=r"(data[0].x), "=r"(data[0].y) + : "l"(src)); +#else + const uint2* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<8, LoadPolicy::CacheGlobal>(void* dst, + const void* src, + bool do_access) +{ + uint2* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %3, 0;\n" + "\tmov.b32 %0, 0;\n" + "\tmov.b32 %1, 0;\n" + "\t@p ld.global.cg.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y) + : "l"(src), "r"((int)do_access)); +#else + const uint2* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0].x = 0; + data[0].y = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<8, LoadPolicy::CacheStreaming>(void* dst, + const void* src) +{ + uint2* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.cs.v2.u32 {%0, %1}, [%2];\n" + : "=r"(data[0].x), "=r"(data[0].y) + : "l"(src)); +#else + const uint2* src_cast = 
reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<8, LoadPolicy::CacheStreaming>(void* dst, + const void* src, + bool do_access) +{ + uint2* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %3, 0;\n" + "\tmov.b32 %0, 0;\n" + "\tmov.b32 %1, 0;\n" + "\t@p ld.global.cs.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y) + : "l"(src), "r"((int)do_access)); +#else + const uint2* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0].x = 0; + data[0].y = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<4>(void* dst, const void* src) +{ + int32_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.ca.u32 {%0}, [%1];\n" : "=r"(*data) : "l"(src)); +#else + const int32_t* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<4>(void* dst, const void* src, bool do_access) +{ + int32_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %2, 0;\n" + "\tmov.b32 %0, 0;\n" + "\t@p ld.global.u32 {%0}, [%1];\n" + "}\n" + : "=r"(data[0]) + : "l"(src), "r"((int)do_access)); +#else + const int32_t* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0] = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<4, LoadPolicy::CacheGlobal>(void* dst, const void* src) +{ + int32_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.cg.u32 {%0}, [%1];\n" : "=r"(*data) : "l"(src)); +#else + const int32_t* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<4, LoadPolicy::CacheGlobal>(void* dst, + const void* src, + bool do_access) 
+{ + int32_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %2, 0;\n" + "\tmov.b32 %0, 0;\n" + "\t@p ld.global.cg.u32 {%0}, [%1];\n" + "}\n" + : "=r"(data[0]) + : "l"(src), "r"((int)do_access)); +#else + const int32_t* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0] = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<4, LoadPolicy::CacheStreaming>(void* dst, + const void* src) +{ + int32_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.cs.u32 {%0}, [%1];\n" : "=r"(*data) : "l"(src)); +#else + const int32_t* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<4, LoadPolicy::CacheStreaming>(void* dst, + const void* src, + bool do_access) +{ + int32_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %2, 0;\n" + "\tmov.b32 %0, 0;\n" + "\t@p ld.global.cs.u32 {%0}, [%1];\n" + "}\n" + : "=r"(data[0]) + : "l"(src), "r"((int)do_access)); +#else + const int32_t* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0] = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<2>(void* dst, const void* src) +{ + int16_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.ca.u16 {%0}, [%1];\n" : "=h"(*data) : "l"(src)); +#else + const int16_t* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<2>(void* dst, const void* src, bool do_access) +{ + int16_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %2, 0;\n" + "\tmov.u16 %0, 0;\n" + "\t@p ld.global.u16 {%0}, [%1];\n" + "}\n" + : "=h"(*data) + : "l"(src), 
"r"((int)do_access)); +#else + const int16_t* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0] = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<2, LoadPolicy::CacheGlobal>(void* dst, const void* src) +{ + int16_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.cg.u16 {%0}, [%1];\n" : "=h"(*data) : "l"(src)); +#else + const int16_t* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<2, LoadPolicy::CacheGlobal>(void* dst, + const void* src, + bool do_access) +{ + int16_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %2, 0;\n" + "\tmov.u16 %0, 0;\n" + "\t@p ld.global.cg.u16 {%0}, [%1];\n" + "}\n" + : "=h"(*data) + : "l"(src), "r"((int)do_access)); +#else + const int16_t* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0] = 0; + } +#endif +} + +template <> +__device__ __forceinline__ void load_global<2, LoadPolicy::CacheStreaming>(void* dst, + const void* src) +{ + int16_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile("ld.global.cs.u16 {%0}, [%1];\n" : "=h"(*data) : "l"(src)); +#else + const int16_t* src_cast = reinterpret_cast(src); + data[0] = src_cast[0]; +#endif +} + +template <> +__device__ __forceinline__ void load_global<2, LoadPolicy::CacheStreaming>(void* dst, + const void* src, + bool do_access) +{ + int16_t* data = reinterpret_cast(dst); +#ifdef PTX_AVAILABLE + asm volatile( + "{\n" + "\t.reg .pred p;\n" + "\tsetp.ne.b32 p, %2, 0;\n" + "\tmov.u16 %0, 0;\n" + "\t@p ld.global.cs.u16 {%0}, [%1];\n" + "}\n" + : "=h"(*data) + : "l"(src), "r"((int)do_access)); +#else + const int16_t* src_cast = reinterpret_cast(src); + if (do_access) { + data[0] = src_cast[0]; + } else { + data[0] = 0; + } +#endif +} + +/////////// Load Shared 
///////////
namespace internal {

#ifdef PTX_AVAILABLE
/*
Convert a generic-address-space pointer into the 32-bit shared-memory window
address expected by the `ld.shared`/`st.shared` PTX instructions below.
*/
__device__ __forceinline__ unsigned convert_to_shared(const void* ptr)
{
#if __CUDACC_VER_MAJOR__ >= 11
    // In CUDA 11 we have a builtin intrinsic
    return __cvta_generic_to_shared(ptr);
#else
    unsigned ret_val;
    asm volatile(
        "{\n"
        "\t.reg .u64 p1;\n"
        "\tcvta.to.shared.u64 p1, %1;\n"  // fixed: PTX statements require a terminating ';'
        "\tcvt.u32.u64 %0, p1;\n"
        "}\n"
        : "=r"(ret_val)
        : "l"(ptr));
    return ret_val;
#endif
}
#endif

}  // namespace internal

/*
Vectorized 16B load from shared memory into `dst` (register file).
*/
template <>
__device__ __forceinline__ void load_shared<16>(void* dst, const void* src)
{
    uint4* data = reinterpret_cast<uint4*>(dst);
#ifdef PTX_AVAILABLE
    unsigned src_shr = internal::convert_to_shared(src);

    asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];\n"
                 : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w)
                 : "r"(src_shr));
#else
    const uint4* src_cast = reinterpret_cast<const uint4*>(src);
    data[0] = src_cast[0];
#endif
}

/*
Predicated 16B shared load: zero-fills `dst` when `do_access` is false so
callers get a defined value without branching around the load.
*/
template <>
__device__ __forceinline__ void load_shared<16>(void* dst, const void* src, bool do_access)
{
    uint4* data = reinterpret_cast<uint4*>(dst);
#ifdef PTX_AVAILABLE
    unsigned src_shr = internal::convert_to_shared(src);

    asm volatile(
        "{\n"
        "\t.reg .pred p;\n"
        "\tsetp.ne.b32 p, %5, 0;\n"
        "\tmov.b32 %0, 0;\n"
        "\tmov.b32 %1, 0;\n"
        "\tmov.b32 %2, 0;\n"
        "\tmov.b32 %3, 0;\n"
        "\t@p ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];\n"
        "}\n"
        : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w)
        : "r"(src_shr), "r"((int)do_access));
#else
    const uint4* src_cast = reinterpret_cast<const uint4*>(src);
    if (do_access) {
        data[0] = src_cast[0];
    } else {
        data[0].x = 0;
        data[0].y = 0;
        data[0].z = 0;
        data[0].w = 0;
    }
#endif
}

/*
Vectorized 8B load from shared memory.
*/
template <>
__device__ __forceinline__ void load_shared<8>(void* dst, const void* src)
{
    uint2* data = reinterpret_cast<uint2*>(dst);
#ifdef PTX_AVAILABLE
    unsigned src_shr = internal::convert_to_shared(src);

    asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];\n"
                 : "=r"(data[0].x), "=r"(data[0].y)
                 : "r"(src_shr));
#else
    const uint2* src_cast = reinterpret_cast<const uint2*>(src);
    data[0] = src_cast[0];
#endif
}

/*
Predicated 8B shared load: zero-fills `dst` when `do_access` is false.
*/
template <>
__device__ __forceinline__ void load_shared<8>(void* dst, const void* src, bool do_access)
{
    uint2* data = reinterpret_cast<uint2*>(dst);
#ifdef PTX_AVAILABLE
    unsigned src_shr = internal::convert_to_shared(src);

    asm volatile(
        "{\n"
        "\t.reg .pred p;\n"
        "\tsetp.ne.b32 p, %3, 0;\n"
        "\tmov.b32 %0, 0;\n"
        "\tmov.b32 %1, 0;\n"
        "\t@p ld.shared.v2.u32 {%0, %1}, [%2];\n"
        "}\n"
        : "=r"(data[0].x), "=r"(data[0].y)
        : "r"(src_shr), "r"((int)do_access));
#else
    const uint2* src_cast = reinterpret_cast<const uint2*>(src);
    if (do_access) {
        data[0] = src_cast[0];
    } else {
        data[0].x = 0;
        data[0].y = 0;
    }
#endif
}

/*
Scalar 4B load from shared memory.
*/
template <>
__device__ __forceinline__ void load_shared<4>(void* dst, const void* src)
{
    int32_t* data = reinterpret_cast<int32_t*>(dst);
#ifdef PTX_AVAILABLE
    unsigned src_shr = internal::convert_to_shared(src);

    asm volatile("ld.shared.u32 {%0}, [%1];\n" : "=r"(*data) : "r"(src_shr));
#else
    const int32_t* src_cast = reinterpret_cast<const int32_t*>(src);
    data[0] = src_cast[0];
#endif
}

/*
Predicated 4B shared load: zero-fills `dst` when `do_access` is false.
*/
template <>
__device__ __forceinline__ void load_shared<4>(void* dst, const void* src, bool do_access)
{
    int32_t* data = reinterpret_cast<int32_t*>(dst);
#ifdef PTX_AVAILABLE
    unsigned src_shr = internal::convert_to_shared(src);

    asm volatile(
        "{\n"
        "\t.reg .pred p;\n"
        "\tsetp.ne.b32 p, %2, 0;\n"
        "\tmov.b32 %0, 0;\n"
        "\t@p ld.shared.u32 %0, [%1];\n"
        "}\n"
        : "=r"(data[0])
        : "r"(src_shr), "r"((int)do_access));
#else
    const int32_t* src_cast = reinterpret_cast<const int32_t*>(src);
    if (do_access) {
        data[0] = src_cast[0];
    } else {
        data[0] = 0;
    }
#endif
}

/////////// Store Global ///////////

/*
16B global store using the default write-back (.wb) cache policy.
*/
template <>
__device__ __forceinline__ void store_global<16>(void* dst, const void* src)
{
    const uint4* data = reinterpret_cast<const uint4*>(src);
#ifdef PTX_AVAILABLE
    asm volatile("st.global.wb.v4.u32 [%0], {%1, %2, %3, %4};\n"
                 :
                 :
"l"(dst), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), "r"(data[0].w) + : "memory"); +#else + uint4* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_global<16, StorePolicy::CacheGlobal>(void* dst, + const void* src) +{ + const uint4* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + asm volatile("st.global.cg.v4.u32 [%0], {%1, %2, %3, %4};\n" + : + : "l"(dst), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), "r"(data[0].w) + : "memory"); +#else + uint4* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_global<16, StorePolicy::CacheStreaming>(void* dst, + const void* src) +{ + const uint4* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + asm volatile("st.global.cs.v4.u32 [%0], {%1, %2, %3, %4};\n" + : + : "l"(dst), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), "r"(data[0].w) + : "memory"); +#else + uint4* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_global<8>(void* dst, const void* src) +{ + const uint2* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + asm volatile("st.global.wb.v2.u32 [%0], {%1, %2};\n" + : + : "l"(dst), "r"(data[0].x), "r"(data[0].y)); +#else + uint2* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_global<8, StorePolicy::CacheGlobal>(void* dst, + const void* src) +{ + const uint2* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + asm volatile("st.global.cg.v2.u32 [%0], {%1, %2};\n" + : + : "l"(dst), "r"(data[0].x), "r"(data[0].y)); +#else + uint2* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_global<8, StorePolicy::CacheStreaming>(void* dst, + const void* src) +{ + const uint2* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + asm 
volatile("st.global.cs.v2.u32 [%0], {%1, %2};\n" + : + : "l"(dst), "r"(data[0].x), "r"(data[0].y)); +#else + uint2* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_global<4>(void* dst, const void* src) +{ + const int32_t* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + asm volatile("st.global.wb.u32 [%0], %1;\n" : : "l"(dst), "r"(*data)); +#else + int32_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_global<4, StorePolicy::CacheGlobal>(void* dst, + const void* src) +{ + const int32_t* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + asm volatile("st.global.cg.u32 [%0], %1;\n" : : "l"(dst), "r"(*data)); +#else + int32_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_global<4, StorePolicy::CacheStreaming>(void* dst, + const void* src) +{ + const int32_t* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + asm volatile("st.global.cs.u32 [%0], %1;\n" : : "l"(dst), "r"(*data)); +#else + int32_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +/////////// Store Shared /////////// + +template <> +__device__ __forceinline__ void store_shared<16>(void* dst, const void* src) +{ + const uint4* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + unsigned dst_int = internal::convert_to_shared(dst); + + asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};\n" + : + : "r"(dst_int), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), "r"(data[0].w)); +#else + uint4* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_shared<8>(void* dst, const void* src) +{ + const uint2* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + unsigned dst_int = internal::convert_to_shared(dst); + + asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n" + : + : 
"r"(dst_int), "r"(data[0].x), "r"(data[0].y)); +#else + uint2* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +template <> +__device__ __forceinline__ void store_shared<4>(void* dst, const void* src) +{ + const int32_t* data = reinterpret_cast(src); +#ifdef PTX_AVAILABLE + unsigned dst_int = internal::convert_to_shared(dst); + + asm volatile("st.shared.u32 [%0], %1;\n" : : "r"(dst_int), "r"(*data)); +#else + int32_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +#endif +} + +/////////// Asynchronous Memory Copy /////////// + +#ifdef ASYNC_COPY_AVAILABLE +template +__device__ __forceinline__ void memcpy_async(void* shr, const void* gbl) +{ + static_assert((AccessSize == 4 || AccessSize == 8 || AccessSize == 16)); + unsigned shr_int = internal::convert_to_shared(shr); + + asm volatile("cp.async.ca.shared.global [%0], [%1], %2;\n" + : + : "r"(shr_int), "l"(gbl), "n"(AccessSize)); +} + +template +__device__ __forceinline__ void memcpy_async_nop(void* shr, const void* gbl, bool predicate) +{ + static_assert((AccessSize == 4 || AccessSize == 8 || AccessSize == 16)); + unsigned shr_int = internal::convert_to_shared(shr); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global [%1], [%2], %3;\n" + "}\n" + : + : "r"((int)predicate), "r"(shr_int), "l"(gbl), "n"(AccessSize)); +} + +template +__device__ __forceinline__ void memcpy_async_zero(void* shr, const void* gbl, bool predicate) +{ + static_assert((AccessSize == 4 || AccessSize == 8 || AccessSize == 16)); + unsigned shr_int = internal::convert_to_shared(shr); + int bytes_to_copy = (predicate ? 
AccessSize : 0); + + asm volatile("cp.async.ca.shared.global [%0], [%1], %2, %3;\n" + : + : "r"(shr_int), "l"(gbl), "n"(AccessSize), "r"(bytes_to_copy)); +} + +template +__device__ __forceinline__ void memcpy_async_zero_nop(void* shr, + const void* gbl, + bool zero_predicate, + bool nop_predicate) +{ + static_assert((AccessSize == 4 || AccessSize == 8 || AccessSize == 16)); + unsigned shr_int = internal::convert_to_shared(shr); + int bytes_to_copy = (zero_predicate ? AccessSize : 0); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global [%1], [%2], %3, %4;\n" + "}\n" + : + : "r"((int)nop_predicate), "r"(shr_int), "l"(gbl), "n"(AccessSize), "r"(bytes_to_copy)); +} + +// Cache global variants. Separate interface to require deliberate use of them. +__device__ __forceinline__ void memcpy_async_cg(void* shr, const void* gbl) +{ + unsigned shr_int = internal::convert_to_shared(shr); + + asm volatile("cp.async.cg.shared.global [%0], [%1], 16;\n" : : "r"(shr_int), "l"(gbl)); +} + +__device__ __forceinline__ void memcpy_async_nop_cg(void* shr, const void* gbl, bool predicate) +{ + unsigned shr_int = internal::convert_to_shared(shr); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], 16;\n" + "}\n" + : + : "r"((int)predicate), "r"(shr_int), "l"(gbl)); +} + +__device__ __forceinline__ void memcpy_async_zero_cg(void* shr, const void* gbl, bool predicate) +{ + unsigned shr_int = internal::convert_to_shared(shr); + int bytes_to_copy = (predicate ? 16 : 0); + + asm volatile("cp.async.cg.shared.global [%0], [%1], 16, %2;\n" + : + : "r"(shr_int), "l"(gbl), "r"(bytes_to_copy)); +} + +__device__ __forceinline__ void memcpy_async_zero_nop_cg(void* shr, + const void* gbl, + bool zero_predicate, + bool nop_predicate) +{ + unsigned shr_int = internal::convert_to_shared(shr); + int bytes_to_copy = (zero_predicate ? 
16 : 0); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], 16, %3;\n" + "}\n" + : + : "r"((int)nop_predicate), "r"(shr_int), "l"(gbl), "r"(bytes_to_copy)); +} + +__device__ __forceinline__ void memcpy_async_fence() { asm volatile("cp.async.commit_group;\n"); } + +template +__device__ __forceinline__ void memcpy_async_wait() +{ + static_assert(stages <= 8); + + asm volatile("cp.async.wait_group %0;\n" : : "n"(stages)); +} + +// TODO: The tail complete should be a known compile time artifact, should try and induce this +// without all of the branches from the call-site. This is a hacky solution. +template <> +__device__ __forceinline__ void tail_complete_wait<1>(int remaining_stages) +{ + if (remaining_stages == 0) memcpy_async_wait<0>(); +} + +template <> +__device__ __forceinline__ void tail_complete_wait<2>(int remaining_stages) +{ + if (remaining_stages == 1) + memcpy_async_wait<1>(); + else if (remaining_stages == 0) + memcpy_async_wait<0>(); +} + +template <> +__device__ __forceinline__ void tail_complete_wait<3>(int remaining_stages) +{ + if (remaining_stages == 2) + memcpy_async_wait<2>(); + else if (remaining_stages == 1) + memcpy_async_wait<1>(); + else if (remaining_stages == 0) + memcpy_async_wait<0>(); +} + +template <> +__device__ __forceinline__ void tail_complete_wait<4>(int remaining_stages) +{ + if (remaining_stages == 3) + memcpy_async_wait<3>(); + else if (remaining_stages == 2) + memcpy_async_wait<2>(); + else if (remaining_stages == 1) + memcpy_async_wait<1>(); + else if (remaining_stages == 0) + memcpy_async_wait<0>(); +} + +template <> +__device__ __forceinline__ void tail_complete_wait<5>(int remaining_stages) +{ + if (remaining_stages == 4) + memcpy_async_wait<4>(); + else if (remaining_stages == 3) + memcpy_async_wait<3>(); + else if (remaining_stages == 2) + memcpy_async_wait<2>(); + else if (remaining_stages == 1) + memcpy_async_wait<1>(); + else if (remaining_stages 
== 0) + memcpy_async_wait<0>(); +} + +template <> +__device__ __forceinline__ void tail_complete_wait<6>(int remaining_stages) +{ + if (remaining_stages == 5) + memcpy_async_wait<5>(); + else if (remaining_stages == 4) + memcpy_async_wait<4>(); + else if (remaining_stages == 3) + memcpy_async_wait<3>(); + else if (remaining_stages == 2) + memcpy_async_wait<2>(); + else if (remaining_stages == 1) + memcpy_async_wait<1>(); + else if (remaining_stages == 0) + memcpy_async_wait<0>(); +} +#endif + +} // namespace mem_access diff --git a/csrc/includes/normalize_layer.h b/csrc/includes/normalize_layer.h index b4d135ec495e61b041c0610deb966fb0468ca4a6..092129362f368cb0825b81c057de2ec533ee5f4c 100644 --- a/csrc/includes/normalize_layer.h +++ b/csrc/includes/normalize_layer.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/csrc/includes/quantization.h b/csrc/includes/quantization.h new file mode 100644 index 0000000000000000000000000000000000000000..89867748280c3fef2b14cd39d42f897725fafa16 --- /dev/null +++ b/csrc/includes/quantization.h @@ -0,0 +1,65 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + +#pragma once + +#include +#include "ds_kernel_utils.h" + +namespace quantize { + +enum class Type { Symmetric, Asymmetric }; + +struct PackedInt4 { + int8_t high : 4; + int8_t low : 4; +}; + +DS_HD_INLINE bool requires_offset(Type qType) { return qType == Type::Asymmetric; } + +} // namespace quantize + +void launch_quant(int8_t* output_data, + float* params, + const __half* input_data, + const int groups, + const int elems_per_group, + const int num_bits, + const quantize::Type quant_type, + cudaStream_t stream); + +template +void launch_dequantize_kernel(T* dequant_data, + const int8_t* q_data, + const float* q_params, + quantize::Type q_type, + int num_bits, + int elems_per_group, + int total_elems, + cudaStream_t stream); + +template +void launch_fake_quantize_kernel(T* vals, + int total_count, + int group_num, + int 
num_bits, + cudaStream_t stream); +template +void launch_sr_fake_quantize_kernel(T* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); +template +void launch_fake_quantize_kernel_asym(T* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); +template +void launch_sr_fake_quantize_kernel_asym(T* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); diff --git a/csrc/includes/quantization_utils.h b/csrc/includes/quantization_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..8b14d1dc8f67a52b4d1484cf2ee76e6d9d38334e --- /dev/null +++ b/csrc/includes/quantization_utils.h @@ -0,0 +1,467 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include +#include "conversion_utils.h" +#include "ds_kernel_utils.h" +#include "memory_access_utils.h" +#include "quantization.h" +#include "reduction_utils.h" + +#pragma once + +using rop = reduce::ROpType; + +namespace quantize { +constexpr int granularity = 16; +constexpr int h_per_load = granularity / sizeof(__half); +constexpr int h2_per_load = granularity / sizeof(__half2); +constexpr int max_threads = 1024; + +/* +Class to hold the quantization parameters for a given tensor. +Holds the implementation of the quantization operation. +*/ +template +class Params { +public: + /* + Quantization implementation, supports + 1) 4 Bit + 2) 8 Bit + 3) Symmetric + 4) Asymmetric + Function Arguments : + val : The __half value to quantize. 
*/
    DS_D_INLINE int8_t quantize(__half val);

    template <typename T>
    DS_D_INLINE T dequantize(int8_t val);

    DS_D_INLINE void store(float* params, int group_index);

    // Initialize from memory
    DS_D_INLINE Params(const float* params, int group_index);
};

/*
Symmetric quantization parameters: a single scale about zero.
`scale` is the multiplier applied while quantizing; store() persists its
reciprocal, which is what the memory-initialized constructor reloads for
the dequantization path.
*/
template <int numBits>
class Params<Type::Symmetric, numBits> {
public:
    float scale;

    DS_D_INLINE Params(float max)
    {
        if (max == 0) {
            // Degenerate all-zero group: any scale works, pick 1 to avoid div-by-0.
            scale = 1.0;
        } else {
            // Map [-max, +max] onto the 2^numBits integer range.
            scale = (1 << numBits) / (2 * max);
        }
    }

    DS_D_INLINE int8_t quantize(__half val)
    {
        constexpr int32_t q_min = -(1 << (numBits - 1));
        constexpr int32_t q_max = (1 << (numBits - 1)) - 1;

        float val_f = conversion::to<float>(val) * scale;
        int32_t data_i32 = conversion::to<int32_t>(val_f);
        // Saturate to the representable signed range for numBits.
        data_i32 = min(max(data_i32, q_min), q_max);
        return (int8_t)data_i32;
    }

    template <typename T>
    DS_D_INLINE T dequantize(int8_t val)
    {
        // NOTE(review): assumes `scale` was initialized from memory (i.e. it
        // already holds the reciprocal written by store()) — confirm at call sites.
        const float val_deq_f = conversion::to<float>(val) * scale;
        return conversion::to<T>(val_deq_f);
    }

    DS_D_INLINE void store(float* params, int group_index)
    {
        const float store_scale = 1 / scale;
        mem_access::store_global<sizeof(float)>(params + group_index, &store_scale);
    }

    DS_D_INLINE Params(const float* params, int group_index)
    {
        mem_access::load_global<sizeof(float)>(&scale, params + group_index);
    }
};

/*
Asymmetric quantization parameters: scale plus zero-point offset so an
arbitrary [min, max] range maps onto the signed 2^numBits integer range.
*/
template <int numBits>
class Params<Type::Asymmetric, numBits> {
public:
    float scale;
    float offset;

    DS_D_INLINE Params(float max, float min)
    {
        if (max == min) {
            // Constant group: avoid a divide-by-zero scale.
            scale = 1.0;
        } else {
            scale = (1 << numBits) / (max - min);
        }
        // Zero point that places `min` at the bottom of the quantized range.
        offset = -(1 << (numBits - 1)) - (min * scale);
    }

    DS_D_INLINE int8_t quantize(__half val)
    {
        constexpr int32_t q_min = -(1 << (numBits - 1));
        constexpr int32_t q_max = (1 << (numBits - 1)) - 1;

        float val_f = conversion::to<float>(val) * scale + offset;
        int32_t data_i32 = conversion::to<int32_t>(val_f);
        // Saturate to the representable signed range for numBits.
        data_i32 = min(max(data_i32, q_min), q_max);
        return (int8_t)data_i32;
    }

    template <typename T>
    DS_D_INLINE T dequantize(int8_t val)
    {
        // NOTE(review): assumes scale/offset were initialized from memory in the
        // inverted form written by store() — confirm at call sites.
        const float val_deq_f = conversion::to<float>(val) * scale + offset;
        // Fixed: return through template parameter T (original hard-coded
        // __half despite being templated on T), matching the symmetric variant.
        return conversion::to<T>(val_deq_f);
    }

    DS_D_INLINE void
store(float* params, int group_index) + { + // Codegen should turn this into stg.64 + const float store_scale = 1 / scale; + mem_access::store_global(params + 2 * group_index, &store_scale); + mem_access::store_global(params + 2 * group_index + 1, &offset); + } + + DS_D_INLINE Params(const float* params, int group_index) + { + // Codegen should turn this into ldg.64 + mem_access::load_global(&scale, params + 2 * group_index); + mem_access::load_global(&offset, params + 2 * group_index + 1); + } +}; + +/* +Group stats tracks the necessary statistics about the quantized group +to abstract the particulars for the main loop. +*/ +template +class GroupStats { +public: + DS_D_INLINE void update(__half2 val); + + DS_D_INLINE void reduce(cg::thread_block& tb, cg::thread_block_tile& warp); +}; + +template <> +class GroupStats { +public: + // Symmetric quantization only tracks the maximum absolute value + __half2 cur_max; + float max; + + /* + Technically, this would give bad results if there + are 0 values to process since the reduction would + give -inf instead of 0. We do not consider this + to be a reasonable edge case. + */ + DS_D_INLINE GroupStats() { cur_max = reduce::init(); } + + /* + Updated the running absmax used to calculate params. + Function Arguments : + val : The __half2 value to update the running min and max with. + */ + DS_D_INLINE void update(__half2 val) + { + cur_max = reduce::element(cur_max, __habs2(val)); + } + + /* + Function to return calculated quantization params. + Template Arguments : + numBits - Number of bits in quantized element. int : 8 or 4 + Function Arguments : + tb - Threadblock object. cg::thread_block + warp - Warp object. 
cg::thread_block_tile + */ + template + DS_D_INLINE Params get_params( + cg::thread_block& tb, + cg::thread_block_tile& warp) + { + const float2 partial_max = conversion::to(cur_max); + float max = reduce::element(partial_max.x, partial_max.y); + + reduce::partitioned_block(tb, warp, max); + Params params(max); + + return params; + } +}; + +template <> +class GroupStats { +public: + __half2 cur_max; + __half2 cur_min; + + /* + Initialize cur_max to -inf, cur_min to inf since + we are doing a true range analysis. + */ + DS_D_INLINE GroupStats() + { + cur_max = reduce::init(); + cur_min = reduce::init(); + } + + /* + Updated the running min and max used to calculate params. + Function Arguments : + val : The __half2 value to update the running min and max with. + */ + DS_D_INLINE void update(__half2 val) + { + cur_max = reduce::element(cur_max, val); + cur_min = reduce::element(cur_min, val); + } + + /* + Function to return calculated quantization params. + Template Arguments : + numBits - Number of bits in quantized element. int : 8 or 4 + Function Arguments : + tb - Threadblock object. cg::thread_block + warp - Warp object. cg::thread_block_tile + */ + template + DS_D_INLINE Params get_params( + cg::thread_block& tb, + cg::thread_block_tile& warp) + { + const float2 partial_max = conversion::to(cur_max); + float max = reduce::element(partial_max.x, partial_max.y); + + const float2 partial_min = conversion::to(cur_min); + float min = reduce::element(partial_min.x, partial_min.y); + + reduce::partitioned_block(tb, warp, max, min); + + Params params(max, min); + + return params; + } +}; + +/* +Device function that quantizes 16 bytes of __half type input data. +Template Arguments : + numBits - Number of bits in quantized element. int : 8 or 4 + qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric +Function Arguments : + local_output - Pointer to local memory to store quantized data. int8_t* + data - Pointer to input data. 
__half* + Params - Parameters for quantization. Params +*/ +template +DS_D_INLINE void _chunk(int8_t* local_output, const __half* data, Params q_params); + +/* +Device function that quantizes 16 bytes of __half2 type input data. +Template Arguments : + numBits - Number of bits in quantized element. int : 8 or 4 + qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric +Function Arguments : + local_output - Pointer to local memory to store quantized data. int8_t* + data - Pointer to input data. __half2* + Params - Parameters for quantization. Params +*/ +template +DS_D_INLINE void _chunk(int8_t* local_output, const __half2* data, Params q_params); + +/* +Helper function to do serial reduction on register-file arrays. +Template Arguments : + qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric + numChunks - Number of bits in quantized element. int : 8 or 4 +Function Arguments : + local_buffer - Pointer memory with input half2 data to be quantized. +*/ +template +DS_D_INLINE GroupStats _local_serial_reduce(__half2* local_buffer); + +/* +The main loop of the kernel that quantizes array in local memory of __half2 type input data, when +Quantization parameters are pre-computed. +Template Arguments : + qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric + numBits - Number of bits in quantized element. int : 8 or 4 + numChunks - Number of chunks(16 bytes of Input data). int : 8 or 4 +Function Arguments : + local_buffer - Pointer memory with input half2 data to be quantized. + scales - Pointer to output scales. + offsets - Pointer to output offsets. + output_data - Pointer to output data. + elems_per_group - Number of elements to quantize in a group. + q_params - Quantization parameters. 
+*/ +template +DS_D_INLINE void local_array(cg::thread_block& tb, + cg::thread_block_tile& warp, + __half2* local_buffer, + float* __restrict__ scales, + float* __restrict__ offsets, + int8_t* __restrict__ output_data, + const int& elems_per_group, + const int& groups, + Params q_params); + +/* +The main loop of the kernel that quantizes array in local memory of __half2 type input data. +This function computes quantization parameters for each group. +Template Arguments : + qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric + numBits - Number of bits in quantized element. int : 8 or 4 + numChunks - Number of chunks(16 bytes of Input data). int : 8 or 4 +Function Arguments : + local_buffer - Pointer memory with input half2 data to be quantized. + scales - Pointer to output scales. + offsets - Pointer to output offsets. + output_data - Pointer to output data. + elems_per_group - Number of elements to quantize in a group. +*/ +template +__device__ void local_array(__half2* local_buffer, + float* __restrict__ scales, + float* __restrict__ offsets, + int8_t* __restrict__ output_data, + const int& elems_per_group, + const int& groups); + +template +DS_D_INLINE void _chunk(int8_t* local_output, const __half* data, Params q_params) +{ + constexpr int32_t elems = 16 / sizeof(__half); + constexpr int32_t num_elems_packed = 8 / numBits; + +#pragma unroll + for (int i = 0, oi = 0; i < elems; i += num_elems_packed, oi++) { + if (num_elems_packed == 1) { + // TODO(cmikeh2): refactor to use conversion utils + local_output[i] = q_params.quantize(data[i]); + } else if (num_elems_packed == 2) { + int8_t data_i8_1 = q_params.quantize(data[i]); + int8_t data_i8_2 = q_params.quantize(data[i + 1]); + auto data_i8 = PackedInt4{data_i8_2, data_i8_1}; + local_output[oi] = *((int8_t*)(&data_i8)); + } + } +} + +template +DS_D_INLINE void _chunk(int8_t* local_output, const __half2* data, Params q_params) +{ + const __half* data_cast = reinterpret_cast(data); + 
_chunk(local_output, data_cast, q_params); +} + +template +DS_D_INLINE GroupStats _local_serial_reduce(__half2* local_buffer) +{ + GroupStats stats; +#pragma unroll + for (int i = 0; i < numChunks * h2_per_load; i++) { stats.update(local_buffer[i]); } + + return stats; +} + +template +DS_D_INLINE void local_array(cg::thread_block& tb, + cg::thread_block_tile& warp, + __half2* local_buffer, + float* __restrict__ global_params, + int8_t* __restrict__ output_data, + const int& elems_per_group, + const int& groups, + Params q_params) +{ + constexpr int num_ele_int8 = 8 / numBits; + constexpr int num_int8_out = quantize::h_per_load / num_ele_int8; + + // Indexing offsets + const int block_num = + (tb.group_index().x * max_threads / threads_per_group) + tb.thread_index().y; + const int block_offset = block_num * elems_per_group; + const int elem_offset = tb.thread_index().x * quantize::h_per_load; + const int base_offset = (block_offset + elem_offset) / num_ele_int8; + const int stride = tb.size() * quantize::h_per_load / num_ele_int8; + + int8_t local_output[num_int8_out]; + + if (tb.thread_index().x == 0 && block_num < groups) { + q_params.store( + global_params, + (tb.group_index().x * max_threads / threads_per_group) + tb.thread_index().y); + } +#pragma unroll + for (int i = 0; i < numChunks; i++) { + if (elem_offset + i * stride * num_ele_int8 < elems_per_group && block_num < groups) { + quantize::_chunk( + local_output, local_buffer + i * quantize::h2_per_load, q_params); + mem_access::store_global(output_data + (base_offset + i * stride), + local_output); + } + } +} + +template +DS_D_INLINE void local_array(cg::thread_block& tb, + cg::thread_block_tile& warp, + __half* local_buffer, + float* __restrict__ global_params, + int8_t* __restrict__ output_data, + const int& elems_per_group, + const int& groups, + Params q_params) +{ + __half2* local_buffer_h2 = reinterpret_cast<__half2*>(local_buffer); + + quantize::local_array( + tb, warp, local_buffer, global_params, 
output_data, elems_per_group, groups, q_params); +} + +template +__device__ void local_array(__half2* local_buffer, + float* __restrict__ global_params, + int8_t* __restrict__ output_data, + const int& elems_per_group, + const int& groups) +{ + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + auto group_stats = _local_serial_reduce(local_buffer); + auto params = group_stats.template get_params(tb, warp); + + quantize::local_array( + tb, warp, local_buffer, global_params, output_data, elems_per_group, groups, params); +} + +template +__device__ void local_array(__half* local_buffer, + float* __restrict__ global_params, + int8_t* __restrict__ output_data, + const int& elems_per_group, + const int& groups) +{ + __half2* local_buffer_h2 = reinterpret_cast<__half2*>(local_buffer); + quantize::local_array( + local_buffer_h2, global_params, output_data, elems_per_group, groups); +} + +} // namespace quantize diff --git a/csrc/includes/quantizer.h b/csrc/includes/quantizer.h index 79eeb14e2a0613255c61f647df44f04fbe9df6c6..fe363c809607e8d805bae70355f12f7828295e7a 100644 --- a/csrc/includes/quantizer.h +++ b/csrc/includes/quantizer.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/csrc/includes/reduction_utils.h b/csrc/includes/reduction_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fabf19dea9111219b29cc3778a4b416f555c82f3 --- /dev/null +++ b/csrc/includes/reduction_utils.h @@ -0,0 +1,591 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#pragma once + +#include "conversion_utils.h" +#include "ds_kernel_utils.h" +#include "memory_access_utils.h" + +namespace cg = cooperative_groups; + +namespace reduce { + +enum class ROpType { + // Addition + Add, + + // Maximum reduction + Max, + + // Minimum reduction + Min, +}; + +constexpr int max_threads = 1024; +constexpr int max_warps = max_threads / hw_warp_size; + +/* +High level 
API. The API takes in a set of operations and variables +and performs that reduction operation on that variable. The reductions +of each of the arguments are completely independent of each other ( +i.e., the val1-op1 combination has no impact on val2-op2). + +Example usage: +``` cpp +float max_val; +float min_val; +reduce::block(tb, warp, max_val, min_val); +``` + +TODO(cmikeh2): In theory, we might be able to do this sequentially with +device functions and rely on the assembler correctly behaving. My initial +instinct is this won't work, but if it does it would reduce implementation +cost significantly. + +TODO(cmikeh2): We need to support sub-block reductions. The warp intrinsic +currently supports this (more incidentally than anything else). It is not +uncommon in something like softmax or a fused attention kernel to map multiple +reductions to a thread block, but each reduction itself is only scoped +to part of the threads (i.e block size = 512, 128 threads per reduction). +*/ +template +DS_D_INLINE void block(cg::thread_block& tb, cg::thread_block_tile& warp, float& val); + +template +DS_D_INLINE void block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2); + +template +DS_D_INLINE void block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2, + float& val3); + +template +DS_D_INLINE void block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2, + float& val3, + float& val4); + +/* +The partitioned block is a special case of the above where in the warps of a threadblock are +partitioned into separate independent reductions. For example, I might have an 8 warp thread block +in which each pair of warps is processing an independent piece of data. I would then reduce that +data with the something like the following: +``` cpp +float max_val; +reduce::partitioned_block(tb, warp, max_val); +``` +After which, each pair of warps would have coherent data with each other. 
Note, this API will not +provide correct results if the number of warps per partition is not a power of 2. +*/ +template +DS_D_INLINE void partitioned_block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val); + +template +DS_D_INLINE void partitioned_block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2); + +template +DS_D_INLINE void partitioned_block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2, + float& val3); + +template +DS_D_INLINE void partitioned_block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2, + float& val3, + float& val4); + +/* +Single element reduction primitives. Used inside serial collection +loops. + +Example usage: +using rop = reduce::OpType; +float min = init(); +for (int i = 0; i < 4; i++) { + min = reduce::element(min, data[i]); +} +*/ + +template +DS_D_INLINE T element(const T lhs, const T rhs); + +template +DS_D_INLINE T init(); + +/********************** Internal reduction APIs **********************/ + +/* +Single element "reductions". TODO(cmikeh2): this sort of "op" concept +should be refactored into its own implementation at some point. This interface +may be easily expanded for new types/operations, but the typical reductions +we need are covered with min/max/add on float. + +NOTE: there is no mean reduction because that relies on knowledge of how +many values were already reduced into each scalar. Implementing this on top +of reduce should be straightforward (can just wrap the sum reduction) and +would be a good extension of the header. 
+*/ + +/* Float element reduce implementations */ +template <> +DS_D_INLINE float element(const float lhs, const float rhs) +{ + return lhs + rhs; +} + +template <> +DS_D_INLINE float element(const float lhs, const float rhs) +{ + return fmaxf(lhs, rhs); +} + +template <> +DS_D_INLINE float element(const float lhs, const float rhs) +{ + return fminf(lhs, rhs); +} + +/* __half element reduce implementation */ +template <> +DS_D_INLINE __half element(const __half lhs, const __half rhs) +{ + return lhs + rhs; +} + +template <> +DS_D_INLINE __half element(const __half lhs, const __half rhs) +{ +#if __CUDA_ARCH__ >= 800 + // Intrinsic limited to Ampere + newer + return __hmax(lhs, rhs); +#else + return (lhs > rhs) ? lhs : rhs; +#endif +} + +template <> +DS_D_INLINE __half element(const __half lhs, const __half rhs) +{ +#if __CUDA_ARCH__ >= 800 + // Intrinsic limited to Ampere + newer + return __hmin(lhs, rhs); +#else + return (lhs < rhs) ? lhs : rhs; +#endif +} + +/* __half2 element reduce implementation */ +template <> +DS_D_INLINE __half2 element(const __half2 lhs, const __half2 rhs) +{ + return lhs + rhs; +} + +template <> +DS_D_INLINE __half2 element(const __half2 lhs, const __half2 rhs) +{ +#if __CUDA_ARCH__ >= 800 + return __hmax2(lhs, rhs); +#else + __half2 ret_val; + ret_val.x = (lhs.x > rhs.x) ? lhs.x : rhs.x; + ret_val.y = (lhs.y > rhs.y) ? lhs.y : rhs.y; + return ret_val; +#endif +} + +template <> +DS_D_INLINE __half2 element(const __half2 lhs, const __half2 rhs) +{ +#if __CUDA_ARCH__ >= 800 + return __hmin2(lhs, rhs); +#else + __half2 ret_val; + ret_val.x = (lhs.x < rhs.x) ? lhs.x : rhs.x; + ret_val.y = (lhs.y < rhs.y) ? 
lhs.y : rhs.y; + return ret_val; +#endif +} + +/* +Reduction initialization primitives +*/ +template <> +DS_D_INLINE float init() +{ + return 0.0f; +} + +template <> +DS_D_INLINE float init() +{ + // Positive infinity + return INFINITY; +} + +template <> +DS_D_INLINE float init() +{ + // Negative infinity + return -INFINITY; +} + +template <> +DS_D_INLINE __half init() +{ + constexpr __half_raw zero = {0x0000}; + return __half(zero); +} + +template <> +DS_D_INLINE __half init() +{ + constexpr __half_raw inf = {0x7C00}; + return __half(inf); +} + +template <> +DS_D_INLINE __half init() +{ + constexpr __half_raw neg_inf = {0xFC00}; + return __half(neg_inf); +} + +template <> +DS_D_INLINE __half2 init() +{ + constexpr __half2_raw zero = {0x0000, 0x0000}; + return __half2(zero); +} + +template <> +DS_D_INLINE __half2 init() +{ + constexpr __half2_raw inf = {0x7C00, 0x7C00}; + return __half2(inf); +} + +template <> +DS_D_INLINE __half2 init() +{ + constexpr __half2_raw neg_inf = {0xFC00, 0xFC00}; + return __half2(neg_inf); +} + +template +DS_D_INLINE void init(T* data) +{ + data[0] = init(); +} + +template +DS_D_INLINE void init(T* data) +{ + data[0] = init(); + data[1] = init(); +} + +template +DS_D_INLINE void init(T* data) +{ + data[0] = init(); + data[1] = init(); + data[2] = init(); +} + +template +DS_D_INLINE void init(T* data) +{ + data[0] = init(); + data[1] = init(); + data[2] = init(); + data[3] = init(); +} + +/* +Warp reduction primitives + +`reduction_width` is an unsafe template parameter, that is that +when using `reduction_width` < hw_warp_size the warp is partitioned +into `hw_warp_size` / `reduction_width` groups of partial sums. + +If someone can figure out how to use variadic templates in a reasonable way +here (fold is C++17 only and I don't think helps and recursion feels like +huge overkill that harms readability) that would be wonderful. 
+*/ + +template +DS_D_INLINE void _warp(cg::thread_block_tile& warp, float* data) +{ +#pragma unroll + for (int i = 1; i < reduce_width; i *= 2) { + data[0] = element(data[0], warp.shfl_xor(data[0], i)); + } +} + +template +DS_D_INLINE void _warp(cg::thread_block_tile& warp, float* data) +{ +#pragma unroll + for (int i = 1; i < reduce_width; i *= 2) { + data[0] = element(data[0], warp.shfl_xor(data[0], i)); + data[1] = element(data[1], warp.shfl_xor(data[1], i)); + } +} + +template +DS_D_INLINE void _warp(cg::thread_block_tile& warp, float* data) +{ +#pragma unroll + for (int i = 1; i < reduce_width; i *= 2) { + data[0] = element(data[0], warp.shfl_xor(data[0], i)); + data[1] = element(data[1], warp.shfl_xor(data[1], i)); + data[2] = element(data[2], warp.shfl_xor(data[2], i)); + } +} + +template +DS_D_INLINE void _warp(cg::thread_block_tile& warp, float* data) +{ +#pragma unroll + for (int i = 1; i < reduce_width; i *= 2) { + data[0] = element(data[0], warp.shfl_xor(data[0], i)); + data[1] = element(data[1], warp.shfl_xor(data[1], i)); + data[2] = element(data[2], warp.shfl_xor(data[2], i)); + data[3] = element(data[3], warp.shfl_xor(data[3], i)); + } +} + +/* +Implementation for primary block reduction that serves both `block` and +`partitioned_block`. + +`local_warp_rank` refers to the warp's location within the partition, so +for an unpartitioned threadblock this will be equivalent to +`warp_arg.meta_group_rank()`. + +Similarly, the warp offset is the `local_warp_rank` of the warp with the +lowest rank in the partition. In the case of an 8 warp block with a +4 warp reduction, this would map to [0, 0, 0, 0, 4, 4, 4, 4]. + +Partition size is the number of warps per partition (equal to the thread +block in the default case). This enables us to only perform the warp reduction +when able to. 
+*/ +template +DS_D_INLINE void _block(cg::thread_block& tb, + cg::thread_block_tile& warp_arg, + float* data, + int warp_offset) +{ + constexpr int elems = sizeof...(Ops); + // Separated for now in case this no longer is true + constexpr int bytes = sizeof(float); + // Unused when `partition_size == 1` or total_warps == 1 + __shared__ float reduce_buffer[max_warps * elems]; + + // Always perform warp-scope reduction + _warp(warp_arg, data); + + // If max_warps == 1 let's skip the runtime check + if (warp_arg.meta_group_size() > 1 && total_warps != 1) { + if (warp_arg.thread_rank() == 0) { +#pragma unroll + for (int i = 0; i < elems; i++) { + mem_access::store_shared( + reduce_buffer + elems * warp_arg.meta_group_rank() + i, data + i); + } + } + + // Synchronization inside block-uniform conditional is safe + tb.sync(); + + if (warp_arg.meta_group_rank() == 0) { + if (warp_arg.thread_rank() < warp_arg.meta_group_size()) { +#pragma unroll + for (int i = 0; i < elems; i++) { + mem_access::load_shared( + data + i, reduce_buffer + elems * warp_arg.thread_rank() + i); + } + } else { + init(data); + } + + _warp(warp_arg, data); + +#pragma unroll + for (int i = 0; i < elems; i++) { + mem_access::store_shared(reduce_buffer + elems * warp_arg.thread_rank() + i, + data + i); + } + } + + // Synchronization inside block-uniform conditional is safe + tb.sync(); + +#pragma unroll + for (int i = 0; i < elems; i++) { + mem_access::load_shared(data + i, + reduce_buffer + warp_arg.meta_group_rank() * elems + i); + } + } +} + +/* +Main API implementations. For the most part, they just convert the individual +variables into arrays, which makes working with them easier with a single +implementation. In theory, we could use the `_block` implementation as another +option, but the nature of using a pointer is a little less safe and this allows +us to obfuscate the details of the partitioned implementation. 
+*/ +template +DS_D_INLINE void block(cg::thread_block& tb, cg::thread_block_tile& warp, float& val) +{ + _block(tb, warp, &val, 0); +} + +template +DS_D_INLINE void block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2) +{ + float data[2] = {val1, val2}; + _block(tb, warp, data, 0); + val1 = data[0]; + val2 = data[1]; +} + +template +DS_D_INLINE void block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2, + float& val3) +{ + float data[3] = {val1, val2, val3}; + _block(tb, warp, data, 0); + val1 = data[0]; + val2 = data[1]; + val3 = data[2]; +} + +template +DS_D_INLINE void block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2, + float& val3, + float& val4) +{ + float data[4] = {val1, val2, val3, val4}; + _block(tb, warp, data, 0); + val1 = data[0]; + val2 = data[1]; + val3 = data[2]; + val4 = data[3]; +} + +/* +Note: for the partitioned blocks, the implementation does not support non-power of 2 blocks in order +to shorten block scale reduction length. 
+*/ +template +DS_D_INLINE void partitioned_block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val) +{ + if (num_threads <= hw_warp_size) { + _warp(warp, &val); + } else { + constexpr int num_warps = num_threads / hw_warp_size; + const int warp_offset = warp.meta_group_rank() & ~(num_warps - 1); + _block(tb, warp, &val, warp_offset); + } +} + +template +DS_D_INLINE void partitioned_block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2) +{ + float data[2] = {val1, val2}; + + if (num_threads <= hw_warp_size) { + _warp(warp, data); + } else { + constexpr int num_warps = num_threads / hw_warp_size; + const int warp_offset = warp.meta_group_rank() & ~(num_warps - 1); + _block(tb, warp, data, warp_offset); + } + + val1 = data[0]; + val2 = data[1]; +} + +template +DS_D_INLINE void partitioned_block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2, + float& val3) +{ + float data[3] = {val1, val2, val3}; + + if (num_threads <= hw_warp_size) { + _warp(warp, data); + } else { + constexpr int num_warps = num_threads / hw_warp_size; + const int warp_offset = warp.meta_group_rank() & ~(num_warps - 1); + _block(tb, warp, data, warp_offset); + } + + val1 = data[0]; + val2 = data[1]; + val3 = data[2]; +} + +template +DS_D_INLINE void partitioned_block(cg::thread_block& tb, + cg::thread_block_tile& warp, + float& val1, + float& val2, + float& val3, + float& val4) +{ + float data[4] = {val1, val2, val3, val4}; + + if (num_threads <= hw_warp_size) { + _warp(warp, data); + } else { + constexpr int num_warps = num_threads / hw_warp_size; + const int warp_offset = warp.meta_group_rank() & ~(num_warps - 1); + _block(tb, warp, data, warp_offset); + } + + val1 = data[0]; + val2 = data[1]; + val3 = data[2]; + val4 = data[3]; +} + +} // namespace reduce diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index 44c4da83e01c02dc664aa712fe6847be0b3f6aa2..61f6ffdf3cbcc99628ce03bd482e6835ee95437d 100644 --- 
a/csrc/includes/simd.h +++ b/csrc/includes/simd.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #if (__x86_64__ || __i386__) @@ -22,7 +26,7 @@ #define SIMD_WIDTH 16 #define SIMD_LOAD2(x, h) \ - ((h) ? _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)x)) : _mm512_loadu_ps(x)) + ((h) ? _mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) : _mm512_loadu_ps(x)) #define SIMD_STORE2(x, d, h) \ ((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ : _mm512_storeu_ps(x, d)) @@ -60,18 +64,16 @@ union AVX_Data { template inline void simd_store(float* dst, AVX_Data* src, bool half_precision) { + size_t width = (half_precision ? SIMD_WIDTH / 2 : SIMD_WIDTH); #pragma unroll - for (size_t i = 0; i < span; ++i) { - SIMD_STORE2(dst + SIMD_WIDTH * i, src[i].data, half_precision); - } + for (size_t i = 0; i < span; ++i) { SIMD_STORE2(dst + width * i, src[i].data, half_precision); } } template inline void simd_load(AVX_Data* dst, float* src, bool half_precision) { + size_t width = (half_precision ? 
1 : SIMD_WIDTH); #pragma unroll - for (size_t i = 0; i < span; ++i) { - dst[i].data = SIMD_LOAD2(src + SIMD_WIDTH * i, half_precision); - } + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD2(src + width * i, half_precision); } } template inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a) diff --git a/csrc/includes/softmax.h b/csrc/includes/softmax.h old mode 100644 new mode 100755 index 8d541a6fe0e691522a3add63bfd92e2620c914d3..676a35d5292ab4462d755828d8a0034fa3c8b518 --- a/csrc/includes/softmax.h +++ b/csrc/includes/softmax.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h index 037319ba0dd963de0aca8327250eeffaf870bcd0..6a1d65cfc7baeb94d8f4dcc246fdb67ebac4acbb 100644 --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/csrc/includes/type_shim.h b/csrc/includes/type_shim.h index 4f4e7a539ac15d6931a6ac3dfd541c6bd2f6760d..9d28019be87745781058e493f491e9973caef23f 100644 --- a/csrc/includes/type_shim.h +++ b/csrc/includes/type_shim.h @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */ #include diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu index c76632362cb3e12abe5ac95a6b8889ea25a61697..a957eb6287515d26f96400936e0ed1aa8f54b0e8 100644 --- a/csrc/lamb/fused_lamb_cuda_kernel.cu +++ b/csrc/lamb/fused_lamb_cuda_kernel.cu @@ -7,12 +7,12 @@ #include "ATen/TensorUtils.h" #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/detail/IndexUtils.cuh" -//#include "ATen/Type.h" +// #include "ATen/Type.h" #include "ATen/AccumulateType.h" #include -//#include +// #include #if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305 #include #else @@ -105,7 
+105,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b) cg::sync(cta); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDA_ARCH__ >= 300) || (defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 502) if (tid < 32) { cg::coalesced_group active = cg::coalesced_threads(); diff --git a/csrc/quantization/dequantize.cu b/csrc/quantization/dequantize.cu new file mode 100644 index 0000000000000000000000000000000000000000..3185b5f9c98ca619353ffcc330d4441f7a1a105b --- /dev/null +++ b/csrc/quantization/dequantize.cu @@ -0,0 +1,71 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "dequantization_utils.h" +#include "memory_access_utils.h" + +namespace cg = cooperative_groups; + +template +__global__ void dequantize_kernel(T* __restrict__ dequant_data, + const int8_t* __restrict__ q_data, + const float* __restrict__ q_params, + int elems_per_group, + int total_elems) +{ + dequantize::to_global( + dequant_data, q_data, q_params, elems_per_group, total_elems); +} + +#define LAUNCH_DEQUANT_KERNEL(num_bits, q_type) \ + dequantize_kernel<<>>( \ + dequant_data, q_data, q_params, elems_per_group, total_elems); + +template +void launch_dequantize_kernel(T* dequant_data, + const int8_t* q_data, + const float* q_params, + quantize::Type q_type, + int num_bits, + int elems_per_group, + int total_elems, + cudaStream_t stream) +{ + constexpr int unroll = 8; + constexpr int threads = 512; + constexpr int elems_per_block = unroll * threads * dequantize::granularity / (sizeof(T)); + + const dim3 block(threads); + const dim3 grid((total_elems + elems_per_block - 1) / elems_per_block); + + // TODO(cmikeh2): It may make sense to tune unroll, there is perf benefit for large + // problem sizes with this large unroll value. 
+ if (num_bits == 8 && q_type == quantize::Type::Symmetric) { + LAUNCH_DEQUANT_KERNEL(8, quantize::Type::Symmetric); + } else if (num_bits == 8 && q_type == quantize::Type::Asymmetric) { + LAUNCH_DEQUANT_KERNEL(8, quantize::Type::Asymmetric); + } else if (num_bits == 4 && q_type == quantize::Type::Symmetric) { + LAUNCH_DEQUANT_KERNEL(4, quantize::Type::Symmetric); + } else if (num_bits == 4 && q_type == quantize::Type::Asymmetric) { + LAUNCH_DEQUANT_KERNEL(4, quantize::Type::Asymmetric); + } +} + +template void launch_dequantize_kernel(__half* dequant_data, + const int8_t* q_data, + const float* q_params, + quantize::Type q_type, + int num_bits, + int elems_per_group, + int total_elems, + cudaStream_t stream); + +template void launch_dequantize_kernel(float* dequant_data, + const int8_t* q_data, + const float* q_params, + quantize::Type q_type, + int num_bits, + int elems_per_group, + int total_elems, + cudaStream_t stream); diff --git a/deepspeed/ops/csrc/quantization/quantizer.cu b/csrc/quantization/fake_quantizer.cu similarity index 81% rename from deepspeed/ops/csrc/quantization/quantizer.cu rename to csrc/quantization/fake_quantizer.cu index 37883410e976c6daaece041d86a8a6e78838a00d..5d23c6ea4a6bce86d6437fc02d7d82d544878fb5 100644 --- a/deepspeed/ops/csrc/quantization/quantizer.cu +++ b/csrc/quantization/fake_quantizer.cu @@ -1,99 +1,92 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #include #include "custom_cuda_layers.h" +#include "memory_access_utils.h" namespace cg = cooperative_groups; -__global__ void quantize_kernel(__half* vals, int group_size, int num_bits) +__global__ void fake_quantize_kernel(__half* vals, int group_size, int num_bits) { #if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); + cg::thread_block b = cg::this_thread_block(); // tb + cg::thread_block_tile<32> g = + cg::tiled_partition<32>(b); // warp, 32 not optimal for 
AMD which should be 64. int gid = threadIdx.x >> 5; int lane = threadIdx.x & 0x1f; int warp_num = blockDim.x >> 5; int id = threadIdx.x; - float2* vals_cast = reinterpret_cast(vals); + constexpr int granularity = 16; + constexpr int vals_per_access = granularity / sizeof(__half); - float2 data[MAX_REG]; + __half data[vals_per_access]; int group_id = blockIdx.x; - { - int group_index = id; - int reg_count = 0; - int offset = group_id * group_size; - float max = -10000.0; - - while (group_index < group_size && reg_count < MAX_REG) { - data[reg_count] = vals_cast[offset + group_index]; - __half* data_h = reinterpret_cast<__half*>(&data[reg_count]); - - if (abs((float)data_h[0]) > max) max = abs((float)data_h[0]); - if (abs((float)data_h[1]) > max) max = abs((float)data_h[1]); - if (abs((float)data_h[2]) > max) max = abs((float)data_h[2]); - if (abs((float)data_h[3]) > max) max = abs((float)data_h[3]); - - group_index += blockDim.x; - reg_count++; - } + int thread_index = id * vals_per_access; + int reg_count = 0; + int offset = group_id * group_size; + float max = -10000.0; + for (int thread_index = id * vals_per_access; thread_index < group_size; + thread_index += blockDim.x * vals_per_access) { + mem_access::load_global(data, vals + offset + thread_index); #pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; + for (int i = 0; i < vals_per_access; i++) { + if (abs((float)data[i]) > max) max = abs((float)data[i]); } - __shared__ float partialMax[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; + } #pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } + for (int i = 1; i < WARP_SIZE; i <<= 1) { + auto temp = g.shfl_xor(max, i); + if (max < temp) max = temp; + } + __shared__ float partialMax[WARP_SIZE]; - max = g.shfl(max, 0); + if (lane == 0) partialMax[gid] = 
max; - float q_scale = (1 << num_bits) / (2 * max + 1e-5); - float q_scale_inv = 1 / q_scale; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - __half2* data_h = reinterpret_cast<__half2*>(&data[i]); - float2 q_data[2]; - q_data[0] = __half22float2(data_h[0]); - q_data[1] = __half22float2(data_h[1]); + b.sync(); - float2 q_data_int[2]; + if (lane < warp_num) max = partialMax[lane]; - q_data_int[0].x = roundf(q_data[0].x * q_scale); - q_data_int[0].y = roundf(q_data[0].y * q_scale); - q_data_int[1].x = roundf(q_data[1].x * q_scale); - q_data_int[1].y = roundf(q_data[1].y * q_scale); +#pragma unroll + for (int i = 1; i < WARP_SIZE; i <<= 1) { + auto temp = g.shfl_down(max, i); + if (max < temp) max = temp; + } - q_data_int[0].x *= q_scale_inv; - q_data_int[0].y *= q_scale_inv; - q_data_int[1].x *= q_scale_inv; - q_data_int[1].y *= q_scale_inv; + max = g.shfl(max, 0); - data_h[0] = __float22half2_rn(q_data_int[0]); - data_h[1] = __float22half2_rn(q_data_int[1]); + float q_scale = (float)(1 << num_bits) / (2 * max + 1e-5); + float q_scale_inv = 1 / q_scale; + int q_range_max = (1 << (num_bits - 1)) - 1; + int q_range_min = -(1 << (num_bits - 1)); - vals_cast[offset + group_index] = data[i]; - } + for (int thread_index = id * vals_per_access; thread_index < group_size; + thread_index += blockDim.x * vals_per_access) { + mem_access::load_global(data, vals + offset + thread_index); +#pragma unroll + for (int j = 0; j < vals_per_access; j++) { + float q_data; + q_data = __half2float(data[j]); + q_data = __float2int_rn(q_data * q_scale); + q_data = q_data > (q_range_max) ? (q_range_max) + : (q_data < (q_range_min) ? 
(q_range_min) : q_data); + data[j] = __float2half_rn(q_data * q_scale_inv); } + mem_access::store_global(vals + offset + thread_index, data); } + #endif } -__global__ void quantize_kernel(float* vals, int group_size, int num_bits) +__global__ void fake_quantize_kernel(float* vals, int group_size, int num_bits) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -103,31 +96,31 @@ __global__ void quantize_kernel(float* vals, int group_size, int num_bits) int warp_num = blockDim.x >> 5; int id = threadIdx.x; - float4* vals_cast = reinterpret_cast(vals); + constexpr int granularity = 16; + constexpr int vals_per_access = granularity / sizeof(float); - float4 data[MAX_REG]; + float data[vals_per_access]; int bid = blockIdx.x; - int group_index = bid * group_size + id; + int thread_index = id * vals_per_access; + int reg_count = 0; - float max = -10000.0; + int offset = bid * group_size; - while (id < group_size && reg_count < MAX_REG) { - float4 data_reg = vals_cast[group_index]; - data[reg_count] = data_reg; + float max = -10000.0; - if (abs(data_reg.x) > max) max = abs(data_reg.x); - if (abs(data_reg.y) > max) max = abs(data_reg.y); - if (abs(data_reg.z) > max) max = abs(data_reg.z); - if (abs(data_reg.w) > max) max = abs(data_reg.w); + for (int thread_index = id * vals_per_access; thread_index < group_size; + thread_index += blockDim.x * vals_per_access) { + mem_access::load_global(data, vals + offset + thread_index); - group_index += blockDim.x; - id += blockDim.x; - reg_count++; +#pragma unroll + for (int i = 0; i < vals_per_access; i++) { + if (abs(data[i]) > max) max = abs(data[i]); + } } - id = threadIdx.x; + #pragma unroll for (int i = 1; i < WARP_SIZE; i <<= 1) { auto temp = g.shfl_xor(max, i); @@ -153,58 +146,55 @@ __global__ void quantize_kernel(float* vals, int group_size, int num_bits) float q_scale = (1 << num_bits) / (2 * max + 1e-5); float q_scale_inv = 1 / q_scale; - for (int i = 0; i < 
reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - float4 q_data; - q_data = data[i]; - float4 q_data_int; - q_data_int.x = roundf(q_data.x * q_scale); - q_data_int.y = roundf(q_data.y * q_scale); - q_data_int.w = roundf(q_data.w * q_scale); - q_data_int.z = roundf(q_data.z * q_scale); - - q_data.x = q_data_int.x * q_scale_inv; - q_data.y = q_data_int.y * q_scale_inv; - q_data.w = q_data_int.w * q_scale_inv; - q_data.z = q_data_int.z * q_scale_inv; + int q_range_max = (1 << (num_bits - 1)) - 1; + int q_range_min = -(1 << (num_bits - 1)); - vals_cast[group_index + bid * group_size] = q_data; + for (int thread_index = id * vals_per_access; thread_index < group_size; + thread_index += blockDim.x * vals_per_access) { + mem_access::load_global(data, vals + offset + thread_index); +#pragma unroll + for (int j = 0; j < vals_per_access; j++) { + float q_data; + q_data = __float2int_rn(data[j] * q_scale); + q_data = q_data > (q_range_max) ? (q_range_max) + : (q_data < (q_range_min) ? 
(q_range_min) : q_data); + data[j] = roundf(q_data * q_scale_inv); } + mem_access::store_global(vals + offset + thread_index, data); } } template -void launch_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream) +void launch_fake_quantize_kernel(T* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream) { dim3 grid_dim(group_num); dim3 block_dim(1024); - quantize_kernel<<>>( - vals, (total_count / group_num) / 4, num_bits); + fake_quantize_kernel<<>>( + vals, total_count / group_num, num_bits); } -template void launch_quantize_kernel(float* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template void launch_quantize_kernel(__half* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); - -__global__ void sr_quantize_kernel(__half* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) +template void launch_fake_quantize_kernel(float* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); +template void launch_fake_quantize_kernel(__half* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); + +__global__ void sr_fake_quantize_kernel(__half* vals, + int token_size, + int token_num, + int num_bits, + std::pair seed) { #if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) @@ -336,11 +326,11 @@ __global__ void sr_quantize_kernel(__half* vals, #endif } -__global__ void sr_quantize_kernel(float* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) +__global__ void sr_fake_quantize_kernel(float* vals, + int token_size, + int token_num, + int num_bits, + std::pair seed) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -456,11 +446,11 @@ __global__ void sr_quantize_kernel(float* vals, } template -void launch_sr_quantize_kernel(T* vals, - int total_count, - int group_num, - int 
num_bits, - cudaStream_t stream) +void launch_sr_fake_quantize_kernel(T* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream) { dim3 block_dim(1024); dim3 grid_dim(group_num); @@ -468,21 +458,21 @@ void launch_sr_quantize_kernel(T* vals, uint64_t inc = total_count / grid_dim.x / block_dim.x; std::pair seed = Context::Instance().IncrementOffset(inc); - sr_quantize_kernel<<>>( + sr_fake_quantize_kernel<<>>( vals, (total_count / group_num) / 4, group_num, num_bits, seed); } -template void launch_sr_quantize_kernel(float* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template void launch_sr_quantize_kernel(__half* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); +template void launch_sr_fake_quantize_kernel(float* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); +template void launch_sr_fake_quantize_kernel(__half* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); -__global__ void quantize_kernel_asym(__half* vals, int group_size, int num_bits) +__global__ void fake_quantize_kernel_asym(__half* vals, int group_size, int num_bits) { #if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) @@ -595,7 +585,7 @@ __global__ void quantize_kernel_asym(__half* vals, int group_size, int num_bits) #endif } -__global__ void quantize_kernel_asym(float* vals, int group_size, int num_bits) +__global__ void fake_quantize_kernel_asym(float* vals, int group_size, int num_bits) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -699,35 +689,35 @@ __global__ void quantize_kernel_asym(float* vals, int group_size, int num_bits) } template -void launch_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream) +void launch_fake_quantize_kernel_asym(T* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t 
stream) { dim3 grid_dim(group_num); dim3 block_dim(1024); - quantize_kernel_asym<<>>( + fake_quantize_kernel_asym<<>>( vals, (total_count / group_num) / 4, num_bits); } -template void launch_quantize_kernel_asym(float* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template void launch_quantize_kernel_asym(__half* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); - -__global__ void sr_quantize_kernel_asym(__half* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) +template void launch_fake_quantize_kernel_asym(float* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); +template void launch_fake_quantize_kernel_asym(__half* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); + +__global__ void sr_fake_quantize_kernel_asym(__half* vals, + int token_size, + int token_num, + int num_bits, + std::pair seed) { #if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) @@ -879,11 +869,11 @@ __global__ void sr_quantize_kernel_asym(__half* vals, #endif } -__global__ void sr_quantize_kernel_asym(float* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) +__global__ void sr_fake_quantize_kernel_asym(float* vals, + int token_size, + int token_num, + int num_bits, + std::pair seed) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -1010,11 +1000,11 @@ __global__ void sr_quantize_kernel_asym(float* vals, } } template -void launch_sr_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream) +void launch_sr_fake_quantize_kernel_asym(T* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream) { dim3 block_dim(1024); dim3 grid_dim(group_num); @@ -1022,16 +1012,16 @@ void launch_sr_quantize_kernel_asym(T* vals, uint64_t inc = total_count / grid_dim.x / block_dim.x; std::pair seed = 
Context::Instance().IncrementOffset(inc); - sr_quantize_kernel<<>>( + sr_fake_quantize_kernel<<>>( vals, (total_count / group_num) / 4, group_num, num_bits, seed); } -template void launch_sr_quantize_kernel_asym(float* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template void launch_sr_quantize_kernel_asym(__half* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); +template void launch_sr_fake_quantize_kernel_asym(float* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); +template void launch_sr_fake_quantize_kernel_asym(__half* vals, + int total_count, + int group_num, + int num_bits, + cudaStream_t stream); diff --git a/csrc/quantization/pt_binding.cpp b/csrc/quantization/pt_binding.cpp index f76c4368a20090d1821776fed4877fa021db57c5..1465cabd296140114661a1a1ef7b3a003827eb83 100644 --- a/csrc/quantization/pt_binding.cpp +++ b/csrc/quantization/pt_binding.cpp @@ -1,7 +1,8 @@ #include #include +#include #include -#include "custom_cuda_layers.h" +#include "quantization.h" template at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits) @@ -10,8 +11,8 @@ at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits) int size = 1; for (auto dim : t_size) size *= dim; - if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) { - launch_quantize_kernel( + if ((((size / groups) - 1) / 4096 + 1) <= 256) { + launch_fake_quantize_kernel( (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); } return vals; @@ -25,7 +26,7 @@ at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits) for (auto dim : t_size) size *= dim; if (((size / groups) / 4 / 1024) <= 256) { - launch_sr_quantize_kernel( + launch_sr_fake_quantize_kernel( (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); } return vals; @@ -38,8 +39,8 @@ at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits) int size = 1; for (auto dim : t_size) size *= dim; 
- if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) { - launch_quantize_kernel_asym( + if ((((size / groups) - 1) / 4096 + 1) <= 256) { + launch_fake_quantize_kernel_asym( (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); } return vals; @@ -53,12 +54,83 @@ at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits) for (auto dim : t_size) size *= dim; if (((size / groups) / 4 / 1024) <= 256) { - launch_sr_quantize_kernel_asym( + launch_sr_fake_quantize_kernel_asym( (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); } return vals; } +std::vector quantize_kernel(at::Tensor& input_vals, + int groups, + int numBits, + quantize::Type quantType) +{ + auto dtype = at::kFloat; + auto params_options = at::TensorOptions() + .dtype(dtype) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + const int param_elems = (quantize::requires_offset(quantType)) ? 2 : 1; + auto params = torch::empty({groups, param_elems}, params_options); + + auto output_options = at::TensorOptions() + .dtype(at::kChar) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + auto output_sizes = input_vals.sizes().vec(); + output_sizes[output_sizes.size() - 1] /= numBits == 8 ? 1 : 2; + auto output = torch::empty(output_sizes, output_options); + + const int elems_per_group = at::numel(input_vals) / groups; + + launch_quant((int8_t*)output.data_ptr(), + (float*)params.data_ptr(), + (__half*)input_vals.data_ptr(), + groups, + elems_per_group, + numBits, + quantType, + at::cuda::getCurrentCUDAStream()); + + return {output, params}; +} + +template +at::Tensor dequantize(at::Tensor& quantized_data, + at::Tensor& params, + int groups, + int num_bits, + quantize::Type quant_type) +{ + auto dtype = (std::is_same::value) ? 
torch::kFloat32 : torch::kFloat16; + auto output_options = at::TensorOptions() + .dtype(dtype) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + auto output_sizes = quantized_data.sizes().vec(); + output_sizes[output_sizes.size() - 1] *= num_bits == 8 ? 1 : 2; + auto output = torch::empty(output_sizes, output_options); + + const int total_elems = at::numel(output); + const int elems_per_group = total_elems / groups; + + launch_dequantize_kernel((T*)output.data_ptr(), + (const int8_t*)quantized_data.data_ptr(), + (const float*)params.data_ptr(), + quant_type, + num_bits, + elems_per_group, + total_elems, + at::cuda::getCurrentCUDAStream()); + + return output; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("ds_quantize_fp32", &ds_quantize, "DeepSpeed Quantize with fp32 (CUDA)"); @@ -74,4 +146,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("ds_sr_quantize_asym_fp16", &ds_sr_quantize_asym<__half>, "DeepSpeed Quantize with fp16 (CUDA)"); + pybind11::enum_(m, "QuantizationType") + .value("Symmetric", quantize::Type::Symmetric) + .value("Asymmetric", quantize::Type::Asymmetric) + .export_values(); + m.def("quantize", &quantize_kernel); + m.def("dequantize", &dequantize<__half>); + m.def("dequantize_fp32", &dequantize); } diff --git a/csrc/quantization/quantize.cu b/csrc/quantization/quantize.cu new file mode 100644 index 0000000000000000000000000000000000000000..b9c925c19f8759c62da634c8a8b8f74974ac4cc7 --- /dev/null +++ b/csrc/quantization/quantize.cu @@ -0,0 +1,150 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "ds_kernel_utils.h" +#include "memory_access_utils.h" +#include "quantization.h" +#include "quantization_utils.h" +#include "reduction_utils.h" + +namespace cg = cooperative_groups; + +/* +Pure quantization kernel with no fusion. 
+*/ +template +__global__ void cached_quantization(int8_t* __restrict__ output_data, + float* __restrict__ params, + const __half* __restrict__ input_data, + int groups, + int elems_per_group) +{ + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + // Indexing offsets + const int block_offset = + (tb.group_index().x * (max_threads / threads_per_group) * elems_per_group) + + (tb.thread_index().y * elems_per_group); + const int elem_offset = tb.thread_index().x * quantize::h_per_load; + const int base_offset = block_offset + elem_offset; + const int stride = tb.size() * quantize::h_per_load; + + const __half* input_base = input_data + base_offset; //.. + + __half2 local_buffer[UNROLL * internal_unroll * quantize::h2_per_load]; + +#pragma unroll + for (int i = 0; i < UNROLL; i++) { + // Convenience helper, should resolve to register indices and not realize. + __half2* iteration_buffer = local_buffer + i * internal_unroll * quantize::h2_per_load; +#pragma unroll + for (int j = 0; j < internal_unroll; j++) { + const int iteration = i * internal_unroll + j; + mem_access::load_global( + iteration_buffer + j * quantize::h2_per_load, + input_base + iteration * stride, + elem_offset + iteration * stride < elems_per_group); + } + } + + quantize:: + local_array( + local_buffer, params, output_data, elems_per_group, groups); +} + +/********* Launcher methods ***********/ +#define LAUNCH_CACHED_QUANT_CALL(q_bits, quant_type) \ + cached_quantization \ + <<>>(output_data, params, input_data, groups, elems_per_group); + +#define LAUNCH_CACHED_QUANT( \ + q_bits, quant_type, unroll_factor_in, internal_unroll_in, threads_per_group_in) \ + const int unroll_factor = unroll_factor_in; \ + const int internal_unroll_l = internal_unroll_in; \ + const int threads_per_group = threads_per_group_in; \ + if (q_bits == 4) { \ + if (quant_type == quantize::Type::Asymmetric) { \ + LAUNCH_CACHED_QUANT_CALL(4, quantize::Type::Asymmetric) \ + } 
else { \ + LAUNCH_CACHED_QUANT_CALL(4, quantize::Type::Symmetric) \ + } \ + } else { \ + if (quant_type == quantize::Type::Asymmetric) { \ + LAUNCH_CACHED_QUANT_CALL(8, quantize::Type::Asymmetric) \ + } else { \ + LAUNCH_CACHED_QUANT_CALL(8, quantize::Type::Symmetric) \ + } \ + } + +void launch_quant(int8_t* output_data, + float* params, + const __half* input_data, + const int groups, + const int elems_per_group, + const int num_bits, + const quantize::Type quant_type, + cudaStream_t stream) +{ + constexpr int max_threads = 256; + + constexpr int internal_unroll = 2; + + const bool is_subblock_schedule = (elems_per_group <= 128) ? true : false; + const int h_per_step = is_subblock_schedule ? quantize::h_per_load + : quantize::h_per_load * internal_unroll; + + // Scheduling concern: may be slightly faster for some inputs to assign multiple stages of + // warp-sized blocks rather than stepping up to 64/96 threads + const int one_step_threads = next_pow2((elems_per_group + h_per_step - 1) / h_per_step); + const int threads_per_group = (one_step_threads < max_threads) ? one_step_threads : max_threads; + + const int groups_per_block = + is_subblock_schedule ? 
(max_threads + threads_per_group - 1) / threads_per_group : 1; + const int groups_launch = (groups_per_block + groups - 1) / groups_per_block; + + dim3 block(threads_per_group, groups_per_block); + dim3 grid(groups_launch); + + const int elems_per_step = threads_per_group * h_per_step; + const int external_unroll = (elems_per_group + elems_per_step - 1) / elems_per_step; + + if (is_subblock_schedule) { + // <=128 + if (threads_per_group == 1) { + LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 1); + } else if (threads_per_group == 2) { + LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 2); + } else if (threads_per_group == 4) { + LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 4); + } else if (threads_per_group == 8) { + LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 8); + } else if (threads_per_group == 16) { + LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 16); + } + } else if (external_unroll == 1) { + // 129 - 4096 elems + // (this can launch with 1-7 warps as well) + LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, internal_unroll, max_threads); + } else if (external_unroll == 2) { + // 4097 - 8192 elems + LAUNCH_CACHED_QUANT(num_bits, quant_type, 2, internal_unroll, max_threads); + } else if (external_unroll == 3) { + // 8193 - 12288 elems + LAUNCH_CACHED_QUANT(num_bits, quant_type, 3, internal_unroll, max_threads); + } else if (external_unroll == 4) { + // 12289 - 16384 elems + LAUNCH_CACHED_QUANT(num_bits, quant_type, 4, internal_unroll, max_threads); + } +} diff --git a/csrc/random_ltd/gather_scatter.cu b/csrc/random_ltd/gather_scatter.cu new file mode 100644 index 0000000000000000000000000000000000000000..8da74796ed1f626643f1ac534cc31915f3edcfad --- /dev/null +++ b/csrc/random_ltd/gather_scatter.cu @@ -0,0 +1,185 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "custom_cuda_layers.h" +#include "memory_access_utils.h" + +namespace cg = cooperative_groups; + +namespace td_data { +constexpr int granularity = 16; +} + +template +__global__ 
void gather_tokens_impl(T* retained_tokens, + const T* activations, + int32_t* gather_indices, + int32_t sampled_tokens, + int32_t channels, + int32_t read_batch_stride, + int32_t read_seq_stride, + int32_t write_batch_stride, + int32_t write_seq_stride) +{ + constexpr int mem_vals_t = td_data::granularity / sizeof(T); + + cg::thread_block tb = cg::this_thread_block(); + + const int gather_idx = gather_indices[tb.group_index().x * sampled_tokens + tb.group_index().y]; + + const int read_offset = read_batch_stride * tb.group_index().x + read_seq_stride * gather_idx; + const int write_offset = + write_batch_stride * tb.group_index().x + write_seq_stride * tb.group_index().y; + + for (int i = tb.thread_index().x * mem_vals_t; i < channels; i += blockDim.x * mem_vals_t) { + T local_data[mem_vals_t]; + mem_access::load_global(local_data, activations + read_offset + i); + mem_access::store_global(retained_tokens + write_offset + i, + local_data); + } +} + +template +void launch_gather_tokens(T* retained_tokens, + T* activations, + int32_t* gather_indices, + int32_t batch_size, + int32_t sampled_tokens, + int32_t channels, + int32_t read_batch_stride, + int32_t read_seq_stride, + int32_t write_batch_stride, + int32_t write_seq_stride, + cudaStream_t stream) +{ + constexpr int mem_vals_t = td_data::granularity / sizeof(T); + + const int load_steps = (channels + mem_vals_t - 1) / mem_vals_t; + const int threads = (load_steps >= 1024) ? 
1024 : load_steps; + + dim3 block(threads); + dim3 grid(batch_size, sampled_tokens); + + gather_tokens_impl<<>>(retained_tokens, + activations, + gather_indices, + sampled_tokens, + channels, + read_batch_stride, + read_seq_stride, + write_batch_stride, + write_seq_stride); +} + +template void launch_gather_tokens(float*, + float*, + int32_t*, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + cudaStream_t); + +template void launch_gather_tokens<__half>(__half*, + __half*, + int32_t*, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + cudaStream_t); + +template +__global__ void scatter_tokens_impl(T* all_activations, + const T* layer_activations, + int32_t* gather_indices, + int32_t retained_tokens, + int32_t channels, + int32_t read_batch_stride, + int32_t read_seq_stride, + int32_t write_batch_stride, + int32_t write_seq_stride) +{ + constexpr int mem_vals_t = td_data::granularity / sizeof(T); + + cg::thread_block tb = cg::this_thread_block(); + + const int gather_idx = + gather_indices[tb.group_index().x * retained_tokens + tb.group_index().y]; + + const int read_offset = + read_batch_stride * tb.group_index().x + read_seq_stride * tb.group_index().y; + const int write_offset = + write_batch_stride * tb.group_index().x + write_seq_stride * gather_idx; + + for (int i = tb.thread_index().x * mem_vals_t; i < channels; i += mem_vals_t * blockDim.x) { + T local_data[mem_vals_t]; + mem_access::load_global(local_data, + layer_activations + read_offset + i); + mem_access::store_global(all_activations + write_offset + i, + local_data); + } +} + +template +void launch_scatter_tokens(T* all_activations, + T* layer_activations, + int32_t* gather_indices, + int32_t batch_size, + int32_t sampled_tokens, + int32_t channels, + int32_t read_batch_stride, + int32_t read_seq_stride, + int32_t write_batch_stride, + int32_t write_seq_stride, + cudaStream_t stream) +{ + constexpr int mem_vals_t = td_data::granularity / 
sizeof(T); + + const int load_steps = (channels + mem_vals_t - 1) / mem_vals_t; + const int threads = (load_steps >= 1024) ? 1024 : load_steps; + + dim3 block(threads); + dim3 grid(batch_size, sampled_tokens); + + scatter_tokens_impl<<>>(all_activations, + layer_activations, + gather_indices, + sampled_tokens, + channels, + read_batch_stride, + read_seq_stride, + write_batch_stride, + write_seq_stride); +} + +template void launch_scatter_tokens(float*, + float*, + int32_t*, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + cudaStream_t); + +template void launch_scatter_tokens<__half>(__half*, + __half*, + int32_t*, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + int32_t, + cudaStream_t); diff --git a/csrc/random_ltd/pt_binding.cpp b/csrc/random_ltd/pt_binding.cpp new file mode 100644 index 0000000000000000000000000000000000000000..54c41cab9851e11cd8ceaa7908c43dbddc69fb1d --- /dev/null +++ b/csrc/random_ltd/pt_binding.cpp @@ -0,0 +1,215 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include +#include +#include "custom_cuda_layers.h" + +torch::Tensor token_sort_(torch::Tensor& unsorted_token_ids, int64_t original_tokens) +{ + const int layers = unsorted_token_ids.size(0); + const int batch_size = unsorted_token_ids.size(1); + const int reserved_tokens = unsorted_token_ids.size(2); + + launch_token_sort(unsorted_token_ids.data_ptr(), + layers, + batch_size, + reserved_tokens, + original_tokens, + c10::cuda::getCurrentCUDAStream()); + + return unsorted_token_ids; +} + +torch::Tensor token_gather(torch::Tensor& activations, + torch::Tensor& sorted_indices, + bool batch_first) +{ + // Activations may be in either [N, S, C] or [S, N, C] while sorted_indices is + // always in [N, retained] + /* + TORCH_CHECK(sorted_indices.size(0) == activations.size(0) || + sorted_indices.size(0) == activations.size(1), + "Unable to match the batch size of the sorted indices to the activation + shape."); 
TORCH_CHECK(activations.size(2) % 8 == 0, "Channels must be divisible by 8 to align + with vectorized loads."); + */ + // bool batch_first = sorted_indices.size(0) == activations.size(0); + + const int64_t dim_0 = (batch_first) ? sorted_indices.size(0) : sorted_indices.size(1); + const int64_t dim_1 = (batch_first) ? sorted_indices.size(1) : sorted_indices.size(0); + const int64_t dim_2 = activations.size(2); + + auto output = torch::empty({dim_0, dim_1, dim_2}, activations.options()); + + const int batch_size = sorted_indices.size(0); + const int channels = dim_2; + const int retained_tokens = sorted_indices.size(1); + const int read_batch_stride = (batch_first) ? activations.stride(0) : activations.stride(1); + const int read_seq_stride = (batch_first) ? activations.stride(1) : activations.stride(0); + const int write_batch_stride = (batch_first) ? output.stride(0) : output.stride(1); + const int write_seq_stride = (batch_first) ? output.stride(1) : output.stride(0); + + if (activations.options().dtype() == torch::kFloat) { + launch_gather_tokens((float*)output.data_ptr(), + (float*)activations.data_ptr(), + (int32_t*)sorted_indices.data_ptr(), + batch_size, + retained_tokens, + channels, + read_batch_stride, + read_seq_stride, + write_batch_stride, + write_seq_stride, + c10::cuda::getCurrentCUDAStream()); + } else { + launch_gather_tokens((__half*)output.data_ptr(), + (__half*)activations.data_ptr(), + (int32_t*)sorted_indices.data_ptr(), + batch_size, + retained_tokens, + channels, + read_batch_stride, + read_seq_stride, + write_batch_stride, + write_seq_stride, + c10::cuda::getCurrentCUDAStream()); + } + + return output; +} + +torch::Tensor token_scatter_(torch::Tensor& all_activations, + torch::Tensor& layer_activations, + torch::Tensor& sorted_indices, + bool batch_first) +{ + // Activations may be in either [N, S, C] or [S, N, C] while sorted_indices is + // always in [N, retained] + /* + TORCH_CHECK(sorted_indices.size(0) == all_activations.size(0) || + 
sorted_indices.size(0) == all_activations.size(1), + "Unable to match the batch size of the sorted indices to the activation + shape."); TORCH_CHECK(all_activations.size(2) % 8 != 0, "Channels must be divisible by 8 to + align with vectorized loads."); + */ + // bool batch_first = sorted_indices.size(0) == all_activations.size(0); + + const int batch_size = sorted_indices.size(0); + const int channels = all_activations.size(2); + const int retained_tokens = sorted_indices.size(1); + const int read_batch_stride = (batch_first) ? layer_activations.stride(0) + : layer_activations.stride(1); + const int read_seq_stride = (batch_first) ? layer_activations.stride(1) + : layer_activations.stride(0); + const int write_batch_stride = (batch_first) ? all_activations.stride(0) + : all_activations.stride(1); + const int write_seq_stride = (batch_first) ? all_activations.stride(1) + : all_activations.stride(0); + + if (all_activations.options().dtype() == torch::kFloat) { + launch_scatter_tokens((float*)all_activations.data_ptr(), + (float*)layer_activations.data_ptr(), + (int32_t*)sorted_indices.data_ptr(), + batch_size, + retained_tokens, + channels, + read_batch_stride, + read_seq_stride, + write_batch_stride, + write_seq_stride, + c10::cuda::getCurrentCUDAStream()); + } else { + launch_scatter_tokens((__half*)all_activations.data_ptr(), + (__half*)layer_activations.data_ptr(), + (int32_t*)sorted_indices.data_ptr(), + batch_size, + retained_tokens, + channels, + read_batch_stride, + read_seq_stride, + write_batch_stride, + write_seq_stride, + c10::cuda::getCurrentCUDAStream()); + } + + return all_activations; +} + +torch::Tensor mask_gather_bert(torch::Tensor& dense_mask, torch::Tensor& sorted_indices) +{ + // TORCH_CHECK(dense_mask.dim() == 4) + + const int batch_size = dense_mask.size(0); + const int layers = sorted_indices.size(0); + /* + TORCH_CHECK(layers * batch_size == sorted_indices.size(0), + "Mismatch between the indices and the mask"); + */ + const int 
orig_seq_len = dense_mask.size(3); + const int truncated_seq_len = sorted_indices.size(2); + + auto output = torch::empty({layers, batch_size, 1, truncated_seq_len, truncated_seq_len}, + dense_mask.options()); + + if (dense_mask.options().dtype() == torch::kFloat) { + launch_slice_bert_mask((float*)output.data_ptr(), + (const float*)dense_mask.data_ptr(), + (const int32_t*)sorted_indices.data_ptr(), + layers, + batch_size, + truncated_seq_len, + orig_seq_len, + c10::cuda::getCurrentCUDAStream()); + } else { + launch_slice_bert_mask((__half*)output.data_ptr(), + (const __half*)dense_mask.data_ptr(), + (const int32_t*)sorted_indices.data_ptr(), + layers, + batch_size, + truncated_seq_len, + orig_seq_len, + c10::cuda::getCurrentCUDAStream()); + } + + return output; +} + +torch::Tensor mask_gather_gpt(torch::Tensor dense_mask, int truncated_seq_len) +{ + // TORCH_CHECK(dense_mask.dim() == 4) + + const int batch_size = dense_mask.size(0); + const int orig_seq_len = dense_mask.size(3); + + auto output = + torch::empty({batch_size, 1, truncated_seq_len, truncated_seq_len}, dense_mask.options()); + + if (dense_mask.options().dtype() == torch::kFloat) { + launch_slice_gpt_mask((float*)output.data_ptr(), + (const float*)dense_mask.data_ptr(), + batch_size, + truncated_seq_len, + orig_seq_len, + c10::cuda::getCurrentCUDAStream()); + } else { + launch_slice_gpt_mask((__half*)output.data_ptr(), + (const __half*)dense_mask.data_ptr(), + batch_size, + truncated_seq_len, + orig_seq_len, + c10::cuda::getCurrentCUDAStream()); + } + + return output; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("token_sort_", &token_sort_, "Comparison free sorting algorithm (CUDA)"); + m.def("token_gather", &token_gather, "Parallel gather of tokens (CUDA)"); + m.def("token_scatter_", &token_scatter_, "Parallel scatter of tokens (CUDA)"); + m.def("mask_gather_bert", &mask_gather_bert, "Token-based mask gather for BERT masking (CUDA)"); + m.def("mask_gather_gpt", &mask_gather_gpt, 
"Token-based mask gather for GPT masking (CUDA)"); +} diff --git a/csrc/random_ltd/slice_attn_masks.cu b/csrc/random_ltd/slice_attn_masks.cu new file mode 100644 index 0000000000000000000000000000000000000000..63d005c4b5087641a48a305332ebe265f50a5ebf --- /dev/null +++ b/csrc/random_ltd/slice_attn_masks.cu @@ -0,0 +1,127 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "custom_cuda_layers.h" +#include "memory_access_utils.h" + +namespace cg = cooperative_groups; + +template +__global__ void slice_gpt_mask_impl(T* output_mask, + const T* input_mask, + int truncated_seq_len, + int orig_seq_len) +{ + const int in_batch_stride = orig_seq_len * orig_seq_len; + const int out_batch_stride = truncated_seq_len * truncated_seq_len; + + cg::thread_block tb = cg::this_thread_block(); + + const T* input_mask_block = + input_mask + blockIdx.x * in_batch_stride + blockIdx.y * orig_seq_len; + T* output_mask_block = + output_mask + blockIdx.x * out_batch_stride + blockIdx.y * truncated_seq_len; + + for (int i = tb.thread_index().x; i < truncated_seq_len; i += blockDim.x) { + output_mask_block[i] = input_mask_block[i]; + } +} + +template +void launch_slice_gpt_mask(T* output_mask, + const T* input_mask, + int batch_size, + int truncated_seq_len, + int orig_seq_len, + cudaStream_t stream) +{ + const int threads = (truncated_seq_len >= 1024) ? 
1024 : truncated_seq_len; + + dim3 block(threads); + dim3 grid(batch_size, truncated_seq_len); + + slice_gpt_mask_impl + <<>>(output_mask, input_mask, truncated_seq_len, orig_seq_len); +} + +template void launch_slice_gpt_mask(float*, const float*, int, int, int, cudaStream_t); + +template void launch_slice_gpt_mask<__half>(__half*, const __half*, int, int, int, cudaStream_t); + +template +__global__ void slice_bert_mask_impl(T* output_mask, + const T* input_mask, + const int32_t* retained_indices, + int32_t truncated_seq_len, + int32_t orig_seq_len) +{ + const int in_batch_stride = orig_seq_len * orig_seq_len; + const int out_batch_stride = truncated_seq_len * truncated_seq_len; + const int out_layer_stride = out_batch_stride * gridDim.y; + + cg::thread_block tb = cg::this_thread_block(); + + const int out_layer_offset = tb.group_index().x * out_layer_stride; + + const int in_batch_offset = tb.group_index().y * in_batch_stride; + const int out_batch_offset = tb.group_index().y * out_batch_stride; + + const int32_t gather_row = + retained_indices[tb.group_index().y * truncated_seq_len + tb.group_index().z]; + const int in_seq_offset = gather_row * orig_seq_len; + const int out_seq_offset = tb.group_index().z * truncated_seq_len; + + const T* in_sequence = input_mask + in_batch_offset + in_seq_offset; + T* out_sequence = output_mask + out_layer_offset + out_batch_offset + out_seq_offset; + const int32_t* gather_data = retained_indices + tb.group_index().y * truncated_seq_len; + + for (int i = tb.thread_index().x; i < truncated_seq_len; i += blockDim.x) { + out_sequence[i] = in_sequence[gather_data[i]]; + } +} + +/* +Since the Bert mask is not causal like GPT, we can't just generate a set of +masks for the entire model based off a single layer sample. 
+ +We map the kernel as follows: +z-dimension: layer +y-dimension: batch +x-dimension: sequence_offset +*/ +template +void launch_slice_bert_mask(T* output_mask, + const T* input_mask, + const int32_t* retained_indices, + int32_t layers, + int32_t batch_size, + int32_t truncated_seq_len, + int32_t orig_seq_len, + cudaStream_t stream) +{ + const int threads = (truncated_seq_len >= 1024) ? 1024 : truncated_seq_len; + dim3 block(threads); + dim3 grid(layers, batch_size, truncated_seq_len); + + slice_bert_mask_impl<<>>( + output_mask, input_mask, retained_indices, truncated_seq_len, orig_seq_len); +} + +template void launch_slice_bert_mask(float*, + const float*, + const int32_t*, + int32_t, + int32_t, + int32_t, + int32_t, + cudaStream_t); + +template void launch_slice_bert_mask<__half>(__half*, + const __half*, + const int32_t*, + int32_t, + int32_t, + int32_t, + int32_t, + cudaStream_t); diff --git a/csrc/random_ltd/token_sort.cu b/csrc/random_ltd/token_sort.cu new file mode 100644 index 0000000000000000000000000000000000000000..d260211f9739a5d875dcd0f56ba050f9438e20df --- /dev/null +++ b/csrc/random_ltd/token_sort.cu @@ -0,0 +1,193 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include +#include "custom_cuda_layers.h" +#include "memory_access_utils.h" + +namespace cg = cooperative_groups; + +namespace td_sort { +constexpr int threads = 512; +constexpr int granularity = 16; +constexpr int mem_vals = granularity / sizeof(int32_t); +constexpr int max_buffer_size = (threads + 1) * mem_vals; + +#ifdef __HIP_PLATFORM_HCC__ +constexpr int warp_size = 64; +#else +constexpr int warp_size = 32; +#endif + +constexpr int max_warps = threads / warp_size; +} // namespace td_sort + +template +__global__ void scan_sort(int32_t* data, int reserved_tokens, int original_tokens) +{ + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + __shared__ int32_t indices_buffer[td_sort::max_buffer_size]; + __shared__ int32_t 
intermediate_buffer[td_sort::max_warps]; + __shared__ int32_t sorted_indices_buffer[td_sort::max_buffer_size]; + + for (int i = tb.thread_index().x * td_sort::mem_vals; i < original_tokens + 1; + i += tb.group_dim().x * td_sort::mem_vals) { + uint32_t zeros[td_sort::mem_vals] = {0, 0, 0, 0}; + mem_access::store_shared(indices_buffer + i, zeros); + } + + int32_t local_vals[VALS_PER_THREAD]; + + // We flatten layers/batch into a single indexing dimension + int32_t* data_block = data + tb.group_index().x * reserved_tokens; + + // The next two loops really could be fused for a more logical code layout, but don't want to + // move the barrier forward +#pragma unroll + for (int i = 0; i < VALS_PER_THREAD; i++) { + const int iter_idx = i * td_sort::threads + tb.thread_index().x; + if (iter_idx < reserved_tokens) { + mem_access::load_global(local_vals + i, data_block + iter_idx); + } else { + local_vals[i] = 0; + } + } + + tb.sync(); + +#pragma unroll + for (int i = 0; i < VALS_PER_THREAD; i++) { + const int iter_idx = i * td_sort::threads + tb.thread_index().x; + if (iter_idx < reserved_tokens) { + const int32_t one = 1; + mem_access::store_shared(indices_buffer + local_vals[i], &one); + } + } + + tb.sync(); + + int32_t local_input[td_sort::mem_vals]; + mem_access::load_shared( + local_input, indices_buffer + tb.thread_index().x * td_sort::mem_vals); + + int32_t reduce_vals[td_sort::mem_vals]; + reduce_vals[0] = local_input[0]; + +#pragma unroll + for (int i = 1; i < td_sort::mem_vals; i++) { + reduce_vals[i] = local_input[i] + reduce_vals[i - 1]; + } + + int32_t step_1_val = reduce_vals[td_sort::mem_vals - 1]; + // Short span exclusive scan algorithm (less work efficient) +#pragma unroll + for (int i = 1; i < td_sort::warp_size; i *= 2) { + int32_t step_val = warp.shfl_up(step_1_val, i); + step_1_val = (warp.thread_rank() < i) ? 
step_1_val : step_1_val + step_val; + } + + if (warp.thread_rank() == td_sort::warp_size - 1) { + mem_access::store_shared(intermediate_buffer + warp.meta_group_rank(), + &step_1_val); + } + + tb.sync(); + + if (warp.meta_group_rank() == 0) { + int32_t step_2_val = 0; + if (warp.thread_rank() < td_sort::max_warps) { + mem_access::load_shared(&step_2_val, + intermediate_buffer + warp.thread_rank()); + } + +#pragma unroll + for (int i = 1; i < td_sort::warp_size; i *= 2) { + int32_t step_val = warp.shfl_up(step_2_val, i); + step_2_val = (warp.thread_rank() < i) ? step_2_val : step_2_val + step_val; + } + + if (warp.thread_rank() < td_sort::max_warps) { + mem_access::store_shared(intermediate_buffer + warp.thread_rank(), + &step_2_val); + } + } + + tb.sync(); + + int step_2_val = 0; + if (warp.meta_group_rank() > 0) { + mem_access::load_shared(&step_2_val, + intermediate_buffer + warp.meta_group_rank() - 1); + } + + const int thread_offset = reduce_vals[td_sort::mem_vals - 1]; + +#pragma unroll + for (int i = 0; i < td_sort::mem_vals; i++) { + reduce_vals[i] += step_1_val + step_2_val - thread_offset; + } + mem_access::store_shared( + indices_buffer + tb.thread_index().x * td_sort::mem_vals, reduce_vals); + + if (tb.thread_index().x == 0) { + indices_buffer[original_tokens] = original_tokens - indices_buffer[original_tokens]; + } + tb.sync(); + + for (int i = 0; i < VALS_PER_THREAD; i++) { + const int iter_idx = i * td_sort::threads + tb.thread_index().x; + if (iter_idx < reserved_tokens) { + if (local_vals[i] == 0) { + int zero = 0; + mem_access::store_shared(sorted_indices_buffer, &zero); + } else { + int sorted_idx; + mem_access::load_shared(&sorted_idx, + indices_buffer + local_vals[i] - 1); + mem_access::store_shared(sorted_indices_buffer + sorted_idx, + local_vals + i); + } + } + } + + tb.sync(); + +#pragma unroll + for (int i = 0; i < VALS_PER_THREAD; i++) { + const int iter_idx = i * td_sort::threads + tb.thread_index().x; + if (iter_idx < reserved_tokens) { + 
int32_t store_val; + mem_access::load_shared(&store_val, sorted_indices_buffer + iter_idx); + mem_access::store_global(data_block + iter_idx, &store_val); + } + } +} + +void launch_token_sort(int32_t* indices, + int layers, + int batch_size, + int reserved_size, + int original_tokens, + cudaStream_t stream) +{ + // Each sort is completely independent, can flatten this dimension + dim3 grid(layers * batch_size); + dim3 block(td_sort::threads); + + const int vals_per_thread = (reserved_size + td_sort::threads - 1) / td_sort::threads; + + if (vals_per_thread == 1) { + scan_sort<1><<>>(indices, reserved_size, original_tokens); + } else if (vals_per_thread == 2) { + scan_sort<2><<>>(indices, reserved_size, original_tokens); + } else if (vals_per_thread == 3) { + scan_sort<3><<>>(indices, reserved_size, original_tokens); + } else if (vals_per_thread == 4) { + scan_sort<4><<>>(indices, reserved_size, original_tokens); + } else { + assert(false); + } +} diff --git a/csrc/spatial/csrc/opt_bias_add.cu b/csrc/spatial/csrc/opt_bias_add.cu new file mode 100644 index 0000000000000000000000000000000000000000..dfcb92facbf7e5e1769a2d734cf6770ab36c5ede --- /dev/null +++ b/csrc/spatial/csrc/opt_bias_add.cu @@ -0,0 +1,148 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include +#include "memory_access_utils.h" +#include "spatial_cuda_layers.h" + +/* +Fused bias add variants +*/ + +namespace badd_opt { +constexpr int threads = 256; +constexpr int steps = 2; +constexpr int granularity = 16; +constexpr int vals_per_h = granularity / sizeof(__half); +constexpr int vals_per_h2 = granularity / sizeof(__half2); +constexpr int vals_per_block = threads * steps * vals_per_h; +constexpr int stride = vals_per_h * threads; +} // namespace badd_opt + +__global__ void opt_bias_add(__half* result, + const __half* activation, + const __half* bias, + int seq_len, + int channels) +{ + const int id = blockIdx.x * badd_opt::vals_per_block + threadIdx.x * badd_opt::vals_per_h; + const int 
stride = badd_opt::vals_per_h * badd_opt::threads; + + for (int i = 0; i < badd_opt::steps; i++) { + if (id + i * badd_opt::stride < seq_len * channels) { + __half2 act_buffer[badd_opt::vals_per_h2]; + __half2 bias_buffer[badd_opt::vals_per_h2]; + + mem_access::load_global(act_buffer, + activation + id + i * stride); + mem_access::load_global(bias_buffer, + bias + ((id + i * stride) % channels)); + + for (int j = 0; j < badd_opt::vals_per_h2; j++) { act_buffer[j] += bias_buffer[j]; } + + mem_access::store_global(result + id + i * stride, act_buffer); + } + } +} + +__global__ void opt_bias_add_add(__half* result, + const __half* activation, + const __half* bias, + const __half* other, + int seq_len, + int channels) +{ + const int id = blockIdx.x * badd_opt::vals_per_block + threadIdx.x * badd_opt::vals_per_h; + const int stride = badd_opt::vals_per_h * badd_opt::threads; + + for (int i = 0; i < badd_opt::steps; i++) { + if (id + i * badd_opt::stride < seq_len * channels) { + __half2 act_buffer[badd_opt::vals_per_h2]; + __half2 bias_buffer[badd_opt::vals_per_h2]; + __half2 other_buffer[badd_opt::vals_per_h2]; + + mem_access::load_global(act_buffer, + activation + id + i * stride); + mem_access::load_global(bias_buffer, + bias + ((id + i * stride) % channels)); + mem_access::load_global(other_buffer, other + id + i * stride); + + for (int j = 0; j < badd_opt::vals_per_h2; j++) { + act_buffer[j] += bias_buffer[j] + other_buffer[j]; + } + + mem_access::store_global(result + id + i * stride, act_buffer); + } + } +} + +__global__ void opt_bias_add_bias_add(__half* result, + const __half* activation, + const __half* bias, + const __half* other, + const __half* other_bias, + int seq_len, + int channels) +{ + const int id = blockIdx.x * badd_opt::vals_per_block + threadIdx.x * badd_opt::vals_per_h; + const int stride = badd_opt::vals_per_h * badd_opt::threads; + + for (int i = 0; i < badd_opt::steps; i++) { + if (id + i * badd_opt::stride < seq_len * channels) { + __half2 
act_buffer[badd_opt::vals_per_h2]; + __half2 bias_buffer[badd_opt::vals_per_h2]; + __half2 other_buffer[badd_opt::vals_per_h2]; + __half2 other_bias_buffer[badd_opt::vals_per_h2]; + + mem_access::load_global(act_buffer, + activation + id + i * stride); + mem_access::load_global(bias_buffer, + bias + ((id + i * stride) % channels)); + mem_access::load_global(other_buffer, other + id + i * stride); + mem_access::load_global( + other_bias_buffer, other_bias + ((id + i * stride) % channels)); + + for (int j = 0; j < badd_opt::vals_per_h2; j++) { + act_buffer[j] = + (act_buffer[j] + bias_buffer[j]) + (other_buffer[j] + other_bias_buffer[j]); + } + + mem_access::store_global(result + id + i * stride, act_buffer); + } + } +} + +void launch_opt_bias_add(__half* result, + const __half* activation, + const __half* bias, + const __half* other, + const __half* other_bias, + int batch_size, + int seq_len, + int channels, + cudaStream_t stream) +{ + // Should evaluate `true` for reasonable hidden sizes + assert(channels % badd_opt::vals_per_h == 0); + + const int effective_seq_len = batch_size * seq_len; + const int vals = effective_seq_len * channels; + + dim3 block(badd_opt::threads); + dim3 grid((vals + badd_opt::vals_per_block - 1) / badd_opt::vals_per_block); + + if (!other) { + // We shouldn't have a bias if there's no activation + assert(!other_bias); + + opt_bias_add<<>>( + result, activation, bias, effective_seq_len, channels); + } else if (!other_bias) { + opt_bias_add_add<<>>( + result, activation, bias, other, effective_seq_len, channels); + } else { + opt_bias_add_bias_add<<>>( + result, activation, bias, other, other_bias, effective_seq_len, channels); + } +} diff --git a/csrc/spatial/csrc/pt_binding.cpp b/csrc/spatial/csrc/pt_binding.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3234b063c241ebc6b3448b4be4b62e689c8f8625 --- /dev/null +++ b/csrc/spatial/csrc/pt_binding.cpp @@ -0,0 +1,111 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team 
+*/ + +#include +#include +#include +#include +#include "spatial_cuda_layers.h" + +ChannelsLastProblem dimension_problem(at::Tensor& input) +{ + ChannelsLastProblem dims; + + if (input.dim() == 4) { + // In some sense this is unsafe (and a reflection of the assumptions made inside + // the C10 options checker). Basically, there's no great way to be sure that + // a tensor is in channels last because a 1x1 image will appear to be in channels + // last even when it isn't. + assert(input.is_contiguous(at::MemoryFormat::ChannelsLast)); + dims.batch_size = input.size(0); + dims.seq_len = input.size(2) * input.size(3); + dims.channels = input.size(1); + } else { + assert(input.is_contiguous()); + dims.batch_size = input.size(0); + dims.seq_len = input.size(1); + dims.channels = input.size(2); + } + + return dims; +} + +at::Tensor seq_unroll_bias_add(at::Tensor& input, at::Tensor& bias) +{ + assert(input.dtype() == at::kHalf); + + // TODO(cmikeh2): Should probably refactor this into a more portable + // description, since it does generalize for channels-last + ChannelsLastProblem problem = dimension_problem(input); + + auto output = at::empty_like(input); + + launch_opt_bias_add((__half*)output.data_ptr(), + (const __half*)input.data_ptr(), + (const __half*)bias.data_ptr(), + nullptr, + nullptr, + problem.batch_size, + problem.seq_len, + problem.channels, + at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor seq_bias_add_add(at::Tensor& input, at::Tensor& bias, at::Tensor& other) +{ + assert(input.dtype() == at::kHalf); + + // TODO(cmikeh2): Should probably refactor this into a more portable + // description, since it does generalize for channels-last + ChannelsLastProblem problem = dimension_problem(input); + + auto output = at::empty_like(input); + + launch_opt_bias_add((__half*)output.data_ptr(), + (const __half*)input.data_ptr(), + (const __half*)bias.data_ptr(), + (const __half*)other.data_ptr(), + nullptr, + problem.batch_size, + problem.seq_len, 
+ problem.channels, + at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor seq_bias_add_bias_add(at::Tensor& input, + at::Tensor& bias, + at::Tensor& other, + at::Tensor& other_bias) +{ + assert(input.dtype() == at::kHalf); + + // TODO(cmikeh2): Should probably refactor this into a more portable + // description, since it does generalize for channels-last + ChannelsLastProblem problem = dimension_problem(input); + + auto output = at::empty_like(input); + + launch_opt_bias_add((__half*)output.data_ptr(), + (const __half*)input.data_ptr(), + (const __half*)bias.data_ptr(), + (const __half*)other.data_ptr(), + (const __half*)other_bias.data_ptr(), + problem.batch_size, + problem.seq_len, + problem.channels, + at::cuda::getCurrentCUDAStream()); + + return output; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("nhwc_bias_add", &seq_unroll_bias_add); + m.def("nhwc_bias_add_add", &seq_bias_add_add); + m.def("nhwc_bias_add_bias_add", &seq_bias_add_bias_add); +} diff --git a/csrc/spatial/includes/spatial_cuda_layers.h b/csrc/spatial/includes/spatial_cuda_layers.h new file mode 100644 index 0000000000000000000000000000000000000000..9948153f56d659c1c5da0f4550ffdf057e826a70 --- /dev/null +++ b/csrc/spatial/includes/spatial_cuda_layers.h @@ -0,0 +1,36 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#pragma once + +#if __CUDA_ARCH__ >= 530 +#define HALF_PRECISION_AVAILABLE = 1 +#endif + +#ifdef __HIPCC__ +#include +#else +#include +#endif + +#include +#include + +/*********** Group Norm Kernels, Structs, and Helpers ************/ + +struct { + int64_t batch_size; + int64_t seq_len; + int64_t channels; +} typedef ChannelsLastProblem; + +void launch_opt_bias_add(__half* result, + const __half* activation, + const __half* bias, + const __half* other, + const __half* other_bias, + int batch_size, + int seq_len, + int channels, + cudaStream_t stream); diff --git a/csrc/transformer/cublas_wrappers.cu b/csrc/transformer/cublas_wrappers.cu index 
75ecd3fb4ef9d5d63d9c7681bdce0cf949641b5d..97ecdd6f13ce7a5dedd7cecd14c5a23e90a22dea 100644 --- a/csrc/transformer/cublas_wrappers.cu +++ b/csrc/transformer/cublas_wrappers.cu @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #include "cublas_wrappers.h" #ifdef __HIP_PLATFORM_HCC__ diff --git a/csrc/transformer/dropout_kernels.cu b/csrc/transformer/dropout_kernels.cu old mode 100644 new mode 100755 index d1ba135f4900f8eff3f6b4cab70d1b35b39f7833..b2f2abc3f09875a51662292f8e195c53da3e2fa3 --- a/csrc/transformer/dropout_kernels.cu +++ b/csrc/transformer/dropout_kernels.cu @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #include "custom_cuda_layers.h" const int unroll_factor = 4; diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index 0e86322291f76573730b56fb25fb5e89f3d09ddd..d2a0f6e9d82a7085d22307439821c9f7e0329bf1 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -113,7 +113,6 @@ BertTransformerLayer::BertTransformerLayer(unsigned layer_id, _seq_length, _hidden_size / _heads, //(T(1.0) / T(sqrt(_hidden_size / _heads))), - //aiss debug 0506 (T(1.0 / (sqrt(_hidden_size / _heads)))), T(0.0), CUBLAS_OP_T, diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu index d683cf0af83daf829e390d83182eb6f0ffd040a6..1f113a988d74e3d772d8268cbd7882ba087dba03 100644 --- a/csrc/transformer/gelu_kernels.cu +++ b/csrc/transformer/gelu_kernels.cu @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #include "custom_cuda_layers.h" inline __device__ float gelu(const float x) diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu index 1eaa94e1e71a40bf44b661656395cf1ed087f589..ea5491003dc95d8cfcaa80804fdd8ce23b9303ff 100644 --- a/csrc/transformer/general_kernels.cu +++ b/csrc/transformer/general_kernels.cu @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #include 
"general_kernels.h" namespace cg = cooperative_groups; diff --git a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu index 175854b8860b92e68485342a21bc7d636d58065c..0be46353c6543a4a87c3657fef4ef60322f91481 100644 --- a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu +++ b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu @@ -1,9 +1,14 @@ -#include "custom_cuda_layers.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "inference_cuda_layers.h" #ifndef __HIP_PLATFORM_HCC__ #include #endif +namespace cg = cooperative_groups; namespace cg = cooperative_groups; __global__ void apply_rotary_pos_emb(float* mixed_query, @@ -13,7 +18,8 @@ __global__ void apply_rotary_pos_emb(float* mixed_query, unsigned seq_offset, unsigned num_heads, unsigned head_size, - unsigned total_count) + unsigned total_count, + int max_out_tokens) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -26,13 +32,15 @@ __global__ void apply_rotary_pos_emb(float* mixed_query, unsigned offset = head_id * head_size; unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; + unsigned seq_index = head_id % seq_len; + unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size; if (head_id < total_count) { while (lane < rotary_dim) { float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; + float k = key_layer[k_offset + lane]; float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -42,7 +50,7 @@ __global__ void apply_rotary_pos_emb(float* mixed_query, k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; + key_layer[k_offset + lane] = k; lane += WARP_SIZE; } @@ -56,9 +64,9 @@ __global__ void apply_rotary_pos_emb(__half* mixed_query, unsigned seq_offset, unsigned num_heads, unsigned head_size, - unsigned total_count) + unsigned total_count, + int max_out_tokens) { -#if __CUDA_ARCH__ >= 700 cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -70,13 +78,15 @@ __global__ void apply_rotary_pos_emb(__half* mixed_query, unsigned offset = head_id * head_size; unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; + unsigned seq_index = head_id % seq_len; + unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size; if (head_id < total_count) { while (lane < rotary_dim) { float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; + float k = (float)key_layer[k_offset + lane]; float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -86,12 +96,11 @@ __global__ void apply_rotary_pos_emb(__half* mixed_query, k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; + key_layer[k_offset + lane] = (__half)k; lane += WARP_SIZE; } } -#endif } __global__ void apply_rotary_pos_emb1(float* mixed_query, float* key_layer, @@ -100,7 +109,8 @@ __global__ void apply_rotary_pos_emb1(float* mixed_query, unsigned seq_offset, unsigned num_heads, unsigned head_size, - unsigned total_count) + unsigned total_count, + int max_out_tokens) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -113,13 +123,15 @@ __global__ void apply_rotary_pos_emb1(float* mixed_query, unsigned offset = head_id * head_size; unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; + unsigned seq_index = head_id % seq_len; + unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size; if (head_id < total_count) { while (lane < rotary_dim) { float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; + float k = key_layer[k_offset + lane]; float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -129,7 +141,7 @@ __global__ void apply_rotary_pos_emb1(float* mixed_query, k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; + key_layer[k_offset + lane] = k; lane += WARP_SIZE; } @@ -142,9 +154,9 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, unsigned seq_offset, unsigned num_heads, unsigned head_size, - unsigned total_count) + unsigned total_count, + int max_out_tokens) { -#if __CUDA_ARCH__ >= 700 cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -153,7 +165,9 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, int lane = id & 0x1f; unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; + unsigned seq_index = head_id % seq_len; unsigned offset = head_id * head_size; + unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size; constexpr unsigned mask[32] = { 0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, 0x10 | 0x10000, @@ -164,14 +178,14 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, 0x2000000, 0x4000000, 0x8000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000}; - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; + unsigned seq_id = (head_id % seq_len) + seq_offset; unsigned half_dim = rotary_dim >> 1; if (head_id < total_count) { while (lane < rotary_dim) { float inv_freq = (float)((lane % half_dim) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; + float k = (float)key_layer[k_offset + lane]; float rotary_sign = (lane > (half_dim - 1) ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -183,12 +197,11 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; + key_layer[k_offset + lane] = (__half)k; lane += WARP_SIZE; } } -#endif } template @@ -202,17 +215,32 @@ void launch_apply_rotary_pos_emb(T* mixed_query, unsigned batch, bool rotate_half, bool rotate_every_two, - cudaStream_t stream) + cudaStream_t stream, + int max_out_tokens) { int total_count = batch * num_heads * seq_len; dim3 block_dims(1024); dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); if (rotate_every_two) - apply_rotary_pos_emb<<>>( - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); + apply_rotary_pos_emb<<>>(mixed_query, + key_layer, + rotary_dim, + seq_len, + offset, + num_heads, + head_size, + total_count, + max_out_tokens); else if (rotate_half) - apply_rotary_pos_emb1<<>>( - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); + apply_rotary_pos_emb1<<>>(mixed_query, + key_layer, + rotary_dim, + seq_len, + offset, + num_heads, + head_size, + total_count, + max_out_tokens); } template void launch_apply_rotary_pos_emb(float*, @@ -225,7 +253,8 @@ template void launch_apply_rotary_pos_emb(float*, unsigned, bool, bool, - cudaStream_t); + cudaStream_t, + int); template void launch_apply_rotary_pos_emb<__half>(__half*, __half*, unsigned, @@ -236,7 +265,9 @@ template void launch_apply_rotary_pos_emb<__half>(__half*, unsigned, bool, bool, - cudaStream_t); + cudaStream_t, + int); + /* __global__ void apply_rotary_pos_emb(float* mixed_query, float* key_layer, diff --git a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.hip b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.hip index 4e04f7aeb4c80be79c2fe6d8b91a4cc2fecde823..a160554e6c9d4872d1fe1c3669d0015b530d0662 
100644 --- a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.hip +++ b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.hip @@ -1,11 +1,16 @@ // !!! This is a file automatically generated by hipify!!! #include "hip/hip_runtime.h" -#include "custom_hip_layers.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "inference_cuda_layers.h" #ifndef __HIP_PLATFORM_HCC__ #include #endif +namespace cg = cooperative_groups; namespace cg = cooperative_groups; __global__ void apply_rotary_pos_emb(float* mixed_query, @@ -15,7 +20,8 @@ __global__ void apply_rotary_pos_emb(float* mixed_query, unsigned seq_offset, unsigned num_heads, unsigned head_size, - unsigned total_count) + unsigned total_count, + int max_out_tokens) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -28,13 +34,15 @@ __global__ void apply_rotary_pos_emb(float* mixed_query, unsigned offset = head_id * head_size; unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; + unsigned seq_index = head_id % seq_len; + unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size; if (head_id < total_count) { while (lane < rotary_dim) { float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; + float k = key_layer[k_offset + lane]; float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -44,7 +52,7 @@ __global__ void apply_rotary_pos_emb(float* mixed_query, k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; + key_layer[k_offset + lane] = k; lane += WARP_SIZE; } @@ -58,9 +66,9 @@ __global__ void apply_rotary_pos_emb(__half* mixed_query, unsigned seq_offset, unsigned num_heads, unsigned head_size, - unsigned total_count) + unsigned total_count, + int max_out_tokens) { -#if __CUDA_ARCH__ >= 700 cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -72,13 +80,15 @@ __global__ void apply_rotary_pos_emb(__half* mixed_query, unsigned offset = head_id * head_size; unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; + unsigned seq_index = head_id % seq_len; + unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size; if (head_id < total_count) { while (lane < rotary_dim) { float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; + float k = (float)key_layer[k_offset + lane]; float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -88,12 +98,11 @@ __global__ void apply_rotary_pos_emb(__half* mixed_query, k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; + key_layer[k_offset + lane] = (__half)k; lane += WARP_SIZE; } } -#endif } __global__ void apply_rotary_pos_emb1(float* mixed_query, float* key_layer, @@ -102,7 +111,8 @@ __global__ void apply_rotary_pos_emb1(float* mixed_query, unsigned seq_offset, unsigned num_heads, unsigned head_size, - unsigned total_count) + unsigned total_count, + int max_out_tokens) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -115,13 +125,15 @@ __global__ void apply_rotary_pos_emb1(float* mixed_query, unsigned offset = head_id * head_size; unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; + unsigned seq_index = head_id % seq_len; + unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size; if (head_id < total_count) { while (lane < rotary_dim) { float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; + float k = key_layer[k_offset + lane]; float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -131,7 +143,7 @@ __global__ void apply_rotary_pos_emb1(float* mixed_query, k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; + key_layer[k_offset + lane] = k; lane += WARP_SIZE; } @@ -144,9 +156,9 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, unsigned seq_offset, unsigned num_heads, unsigned head_size, - unsigned total_count) + unsigned total_count, + int max_out_tokens) { -#if __CUDA_ARCH__ >= 700 cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -155,7 +167,9 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, int lane = id & 0x1f; unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; + unsigned seq_index = head_id % seq_len; unsigned offset = head_id * head_size; + unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size; constexpr unsigned mask[32] = { 0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, 0x10 | 0x10000, @@ -166,14 +180,14 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, 0x2000000, 0x4000000, 0x8000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000}; - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; + unsigned seq_id = (head_id % seq_len) + seq_offset; unsigned half_dim = rotary_dim >> 1; if (head_id < total_count) { while (lane < rotary_dim) { float inv_freq = (float)((lane % half_dim) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; + float k = (float)key_layer[k_offset + lane]; float rotary_sign = (lane > (half_dim - 1) ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -185,12 +199,11 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; + key_layer[k_offset + lane] = (__half)k; lane += WARP_SIZE; } } -#endif } template @@ -204,17 +217,32 @@ void launch_apply_rotary_pos_emb(T* mixed_query, unsigned batch, bool rotate_half, bool rotate_every_two, - hipStream_t stream) + hipStream_t stream, + int max_out_tokens) { int total_count = batch * num_heads * seq_len; dim3 block_dims(1024); dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); if (rotate_every_two) - hipLaunchKernelGGL(( apply_rotary_pos_emb), dim3(grid_dims), dim3(block_dims), 0, stream, - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); + hipLaunchKernelGGL(( apply_rotary_pos_emb), dim3(grid_dims), dim3(block_dims), 0, stream, mixed_query, + key_layer, + rotary_dim, + seq_len, + offset, + num_heads, + head_size, + total_count, + max_out_tokens); else if (rotate_half) - hipLaunchKernelGGL(( apply_rotary_pos_emb1), dim3(grid_dims), dim3(block_dims), 0, stream, - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); + hipLaunchKernelGGL(( apply_rotary_pos_emb1), dim3(grid_dims), dim3(block_dims), 0, stream, mixed_query, + key_layer, + rotary_dim, + seq_len, + offset, + num_heads, + head_size, + total_count, + max_out_tokens); } template void launch_apply_rotary_pos_emb(float*, @@ -227,7 +255,8 @@ template void launch_apply_rotary_pos_emb(float*, unsigned, bool, bool, - hipStream_t); + hipStream_t, + int); template void launch_apply_rotary_pos_emb<__half>(__half*, __half*, unsigned, @@ -238,7 +267,9 @@ template void launch_apply_rotary_pos_emb<__half>(__half*, unsigned, bool, bool, - hipStream_t); + hipStream_t, + int); + /* __global__ void 
apply_rotary_pos_emb(float* mixed_query, float* key_layer, diff --git a/csrc/transformer/inference/csrc/dequantize.cu b/csrc/transformer/inference/csrc/dequantize.cu index 4ddaabda3eb70b1b958b1fc4c2f959867828d1a2..33605e1f54e0d09e7948d172c6539f60d8424016 100644 --- a/csrc/transformer/inference/csrc/dequantize.cu +++ b/csrc/transformer/inference/csrc/dequantize.cu @@ -1,4 +1,8 @@ -#include "custom_cuda_layers.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "inference_cuda_layers.h" #define MAX_QUANTIZE_GROUPING 1024 @@ -46,8 +50,6 @@ __global__ void dequantize_kernel(__half* output, unsigned groups, unsigned merge_count) { -#ifdef HALF_PRECISION_AVAILABLE - unsigned merge_hidden = hidden_dim >> merge_count; unsigned quantization_stride = (merge_hidden * output_size) / groups; @@ -71,7 +73,6 @@ __global__ void dequantize_kernel(__half* output, output[q_index] = __float2half(scale_data * (float)q); tid += blockDim.x; } -#endif } template @@ -108,3 +109,86 @@ template void launch_dequantize<__half>(__half*, unsigned, unsigned, cudaStream_t); + +__global__ void dequantize_kernel(float* output, + const int8_t* input, + const float* qscale, + int hidden_dim, + unsigned merge_hidden, + int cnt) +{ +} + +__global__ void dequantize_kernel(__half* output, + const int8_t* input, + const float* qscale, + unsigned hidden_dim, + unsigned merge_hidden, + int cnt) +{ + unsigned bid = blockIdx.x * gridDim.y + blockIdx.y; + unsigned tid = threadIdx.x; + + float local_scale = qscale[blockIdx.x]; + + const float* input_cast = reinterpret_cast(input); + float2* output_cast = reinterpret_cast(output); + + input_cast += bid * merge_hidden; + output_cast += bid * merge_hidden; + + for (int c = 0; c < cnt; c++) { + if (tid < merge_hidden) { + float q = input_cast[tid]; + int8_t* q_int8 = (int8_t*)&q; + + float2 q_f; + __half* q_h = (__half*)&q_f; + + q_h[0] = __float2half(local_scale * (float)q_int8[0]); + q_h[1] = __float2half(local_scale * (float)q_int8[1]); + q_h[2] = 
__float2half(local_scale * (float)q_int8[2]); + q_h[3] = __float2half(local_scale * (float)q_int8[3]); + output_cast[tid] = q_f; + tid += blockDim.x; + } + } +} + +template +void launch_dequantize(T* output, + const int8_t* input, + const float* qscale, + unsigned output_size, + unsigned hidden_dim, + unsigned groups, + cudaStream_t stream) +{ + unsigned threads = 1024; + hidden_dim /= 4; + unsigned hid_cnt = threads / hidden_dim; + unsigned thd_cnt = (hidden_dim - 1) / threads + 1; + hid_cnt = hid_cnt > 0 ? hid_cnt : 1; + + unsigned blocks = (output_size + hid_cnt * groups - 1) / (hid_cnt * groups); + dim3 block_dims(threads); + dim3 grid_dims(groups, blocks); + + dequantize_kernel<<>>( + output, input, qscale, hidden_dim, hid_cnt * hidden_dim, thd_cnt); +} + +template void launch_dequantize(float*, + const int8_t*, + const float*, + unsigned, + unsigned, + unsigned, + cudaStream_t); +template void launch_dequantize<__half>(__half*, + const int8_t*, + const float*, + unsigned, + unsigned, + unsigned, + cudaStream_t); diff --git a/csrc/transformer/inference/csrc/dequantize.hip b/csrc/transformer/inference/csrc/dequantize.hip index 7c22e306aace1058947ed47e58c0427a4f066ecb..8b26902d80dca4ca2387f8dfd3d8bea7093bfd36 100644 --- a/csrc/transformer/inference/csrc/dequantize.hip +++ b/csrc/transformer/inference/csrc/dequantize.hip @@ -1,6 +1,10 @@ // !!! This is a file automatically generated by hipify!!! 
#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "inference_cuda_layers.h" #define MAX_QUANTIZE_GROUPING 1024 @@ -48,8 +52,6 @@ __global__ void dequantize_kernel(__half* output, unsigned groups, unsigned merge_count) { -#ifdef HALF_PRECISION_AVAILABLE - unsigned merge_hidden = hidden_dim >> merge_count; unsigned quantization_stride = (merge_hidden * output_size) / groups; @@ -73,7 +75,6 @@ __global__ void dequantize_kernel(__half* output, output[q_index] = __float2half(scale_data * (float)q); tid += blockDim.x; } -#endif } template @@ -110,3 +111,86 @@ template void launch_dequantize<__half>(__half*, unsigned, unsigned, hipStream_t); + +__global__ void dequantize_kernel(float* output, + const int8_t* input, + const float* qscale, + int hidden_dim, + unsigned merge_hidden, + int cnt) +{ +} + +__global__ void dequantize_kernel(__half* output, + const int8_t* input, + const float* qscale, + unsigned hidden_dim, + unsigned merge_hidden, + int cnt) +{ + unsigned bid = blockIdx.x * gridDim.y + blockIdx.y; + unsigned tid = threadIdx.x; + + float local_scale = qscale[blockIdx.x]; + + const float* input_cast = reinterpret_cast(input); + float2* output_cast = reinterpret_cast(output); + + input_cast += bid * merge_hidden; + output_cast += bid * merge_hidden; + + for (int c = 0; c < cnt; c++) { + if (tid < merge_hidden) { + float q = input_cast[tid]; + int8_t* q_int8 = (int8_t*)&q; + + float2 q_f; + __half* q_h = (__half*)&q_f; + + q_h[0] = __float2half(local_scale * (float)q_int8[0]); + q_h[1] = __float2half(local_scale * (float)q_int8[1]); + q_h[2] = __float2half(local_scale * (float)q_int8[2]); + q_h[3] = __float2half(local_scale * (float)q_int8[3]); + output_cast[tid] = q_f; + tid += blockDim.x; + } + } +} + +template +void launch_dequantize(T* output, + const int8_t* input, + const float* qscale, + unsigned output_size, + unsigned hidden_dim, + unsigned groups, + hipStream_t stream) +{ + 
unsigned threads = 1024; + hidden_dim /= 4; + unsigned hid_cnt = threads / hidden_dim; + unsigned thd_cnt = (hidden_dim - 1) / threads + 1; + hid_cnt = hid_cnt > 0 ? hid_cnt : 1; + + unsigned blocks = (output_size + hid_cnt * groups - 1) / (hid_cnt * groups); + dim3 block_dims(threads); + dim3 grid_dims(groups, blocks); + + hipLaunchKernelGGL(( dequantize_kernel), dim3(grid_dims), dim3(block_dims), 0, stream, + output, input, qscale, hidden_dim, hid_cnt * hidden_dim, thd_cnt); +} + +template void launch_dequantize(float*, + const int8_t*, + const float*, + unsigned, + unsigned, + unsigned, + hipStream_t); +template void launch_dequantize<__half>(__half*, + const int8_t*, + const float*, + unsigned, + unsigned, + unsigned, + hipStream_t); diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu index 70bbf42cf9ed74558ce1b789d939c17d38573a86..71a37bb368c798df32515d8df6baddc3cd6c415b 100644 --- a/csrc/transformer/inference/csrc/gelu.cu +++ b/csrc/transformer/inference/csrc/gelu.cu @@ -1,5 +1,12 @@ -#include "custom_cuda_layers.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ +#include "conversion_utils.h" +#include "inference_cuda_layers.h" +#include "memory_access_utils.h" + +namespace cg = cooperative_groups; #define MAX_CAP 4 #define MAX_SEQ 2048 @@ -10,74 +17,32 @@ inline __device__ float gelu(const float x) return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); } -__global__ void fused_bias_gelu(float* input, - const float* bias, - int total_count, - int intermediate_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = 
gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_gelu(__half* input, - const __half* bias, - int total_count, - int intermediate_size) +/* +In-place gelu(biasAdd(x)) for channels last +*/ +template +__global__ void fused_bias_gelu(T* input, const T* bias, int total_count, int intermediate_size) { -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; + // Input restriction: intermediate_size % vals_per_access == 0 + constexpr int granularity = 16; + constexpr int values_per_access = granularity / sizeof(T); + const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * values_per_access; if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; + T data[values_per_access]; + T data_bias[values_per_access]; + mem_access::load_global(data, input + offset); + mem_access::load_global(data_bias, bias + (offset % intermediate_size)); + +#pragma unroll + for (int i = 0; i < values_per_access; i++) { + float data_f = conversion::to(data[i]); + float bias_f = 
conversion::to(data_bias[i]); + data[i] = conversion::to(gelu(data_f + bias_f)); + } + + mem_access::store_global(input + offset, data); } -#endif } template @@ -87,316 +52,324 @@ void launch_bias_gelu(T* input, int batch_size, cudaStream_t stream) { - int total_count = batch_size * (intermediate_size / 4); - int threads = 1024; // intermediate_size / iterations / 4; + constexpr int threads = 1024; + constexpr int granularity = 16; + + const int total_count = batch_size * intermediate_size; + const int elems_per_block = threads * (granularity / sizeof(T)); dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / 1024 + 1)); // (batch_size); + dim3 grid_dims((total_count + elems_per_block - 1) / elems_per_block); fused_bias_gelu<<>>( - input, bias, total_count, intermediate_size / 4); + input, bias, total_count, intermediate_size); } template void launch_bias_gelu(float*, const float*, int, int, cudaStream_t); template void launch_bias_gelu<__half>(__half*, const __half*, int, int, cudaStream_t); -__global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % hidden_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size) +/* +In-place channels-last bias add +*/ +template +__global__ void fused_bias_add(T* input, const T* bias, int total_count, int intermediate_size) { -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; + // Input 
restriction: intermediate_size % vals_per_access == 0 + constexpr int granularity = 16; + constexpr int values_per_access = granularity / sizeof(T); + const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * values_per_access; if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % hidden_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; + T data[values_per_access]; + T data_bias[values_per_access]; + mem_access::load_global(data, input + offset); + mem_access::load_global(data_bias, bias + (offset % intermediate_size)); + +#pragma unroll + for (int i = 0; i < values_per_access; i++) { + float data_f = conversion::to(data[i]); + float bias_f = conversion::to(data_bias[i]); + data[i] = conversion::to(data_f + bias_f); + } + + mem_access::store_global(input + offset, data); } -#endif } template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream) +void launch_bias_add(T* input, + const T* bias, + int intermediate_size, + int batch_size, + cudaStream_t stream) { - int total_count = batch_size * (hidden_size / 4); - int threads = 1024; // hidden_size / iterations / 4; + constexpr int threads = 1024; + constexpr int granularity = 16; + + const int total_count = batch_size * intermediate_size; + const int elems_per_block = threads * (granularity / sizeof(T)); dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / threads + 1)); // 
(batch_size); + dim3 grid_dims((total_count + elems_per_block - 1) / elems_per_block); - fused_bias_add<<>>(input, bias, total_count, hidden_size / 4); + fused_bias_add<<>>( + input, bias, total_count, intermediate_size); } template void launch_bias_add(float*, const float*, int, int, cudaStream_t); template void launch_bias_add<__half>(__half*, const __half*, int, int, cudaStream_t); -__global__ void fused_bias_residual(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - int mp_size) +__global__ void fused_bias_residual(float* residual, + const float* hidden_state, + const float* attn, + const float* bias, + const float* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale, + const bool preln) { - float4* input_cast = reinterpret_cast(input); - float4* output_cast = reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; + float4* res_fl4_ptr = reinterpret_cast(residual); + const float4* hs_fl4_ptr = reinterpret_cast(hidden_state); + const float4* attn_fl4_ptr = reinterpret_cast(attn); + const float4* bias_fl4_ptr = reinterpret_cast(bias); + const float4* attn_bias_fl4_ptr = reinterpret_cast(attn_bias); + const int offset = blockIdx.x * blockDim.x + threadIdx.x; if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x); - data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y); - data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z); - data.w = (data.w + 
res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; + float4 res_fl4 = res_fl4_ptr[offset]; + const float4 hs_fl4 = hs_fl4_ptr[offset]; + const float4 attn_fl4 = attn_fl4_ptr[offset]; + const float4 bias_fl4 = bias_fl4_ptr[offset % intermediate_size]; + const float4 attn_bias_fl4 = attn_bias_fl4_ptr[offset % intermediate_size]; + if (preln) { + // residual = (residual + attention + bias + attention_bias) * + // mp_scale + hidden_state + res_fl4.x = + (res_fl4.x + attn_fl4.x + bias_fl4.x + attn_bias_fl4.x) * mp_scale + (hs_fl4.x); + res_fl4.y = + (res_fl4.y + attn_fl4.y + bias_fl4.y + attn_bias_fl4.y) * mp_scale + (hs_fl4.y); + res_fl4.z = + (res_fl4.z + attn_fl4.z + bias_fl4.z + attn_bias_fl4.z) * mp_scale + (hs_fl4.z); + res_fl4.w = + (res_fl4.w + attn_fl4.w + bias_fl4.w + attn_bias_fl4.w) * mp_scale + (hs_fl4.w); + } else { + // residual += hidden_state + bias + res_fl4.x = res_fl4.x + hs_fl4.x + bias_fl4.x; + res_fl4.y = res_fl4.y + hs_fl4.y + bias_fl4.y; + res_fl4.z = res_fl4.z + hs_fl4.z + bias_fl4.z; + res_fl4.w = res_fl4.w + hs_fl4.w + bias_fl4.w; + } + res_fl4_ptr[offset] = res_fl4; } } -__global__ void fused_bias_residual(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - int mp_size) +__global__ void fused_bias_residual(__half* residual, + const __half* hidden_state, + const __half* attn, + const __half* bias, + const __half* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale, + const bool preln) { -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; + float2* res_fl2_ptr = reinterpret_cast(residual); + const float2* 
hs_fl2_ptr = reinterpret_cast(hidden_state); + const float2* attn_fl2_ptr = reinterpret_cast(attn); + const float2* bias_fl2_ptr = reinterpret_cast(bias); + const float2* attn_bias_fl2_ptr = reinterpret_cast(attn_bias); + const int offset = blockIdx.x * blockDim.x + threadIdx.x; if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - (low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - (low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - (high_data.x + high_res.x) * mp_size + (high_out.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - (high_data.y + high_res.y) * mp_size + (high_out.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; + float2 res_fl2 = res_fl2_ptr[offset]; + const float2 
hs_fl2 = hs_fl2_ptr[offset]; + const float2 attn_fl2 = attn_fl2_ptr[offset]; + const float2 bias_fl2 = bias_fl2_ptr[offset % intermediate_size]; + const float2 attn_bias_fl2 = attn_bias_fl2_ptr[offset % intermediate_size]; + + __half2* res_half2 = reinterpret_cast<__half2*>(&res_fl2); + const __half2* hs_half2 = reinterpret_cast(&hs_fl2); + const __half2* attn_half2 = reinterpret_cast(&attn_fl2); + const __half2* bias_half2 = reinterpret_cast(&bias_fl2); + const __half2* attn_bias_half2 = reinterpret_cast(&attn_bias_fl2); + + float2 res_low = __half22float2(res_half2[0]); + float2 res_high = __half22float2(res_half2[1]); + + const float2 hs_low = __half22float2(hs_half2[0]); + const float2 hs_high = __half22float2(hs_half2[1]); + + const float2 attn_low = __half22float2(attn_half2[0]); + const float2 attn_high = __half22float2(attn_half2[1]); + + const float2 bias_low = __half22float2(bias_half2[0]); + const float2 bias_high = __half22float2(bias_half2[1]); + + const float2 attn_bias_low = __half22float2(attn_bias_half2[0]); + const float2 attn_bias_high = __half22float2(attn_bias_half2[1]); + + if (preln) { + // residual = (residual + attention + bias + attention_bias) * + // mp_scale + hidden_state + res_low.x = + (res_low.x + attn_low.x + bias_low.x + attn_bias_low.x) * mp_scale + hs_low.x; + res_low.y = + (res_low.y + attn_low.y + bias_low.y + attn_bias_low.y) * mp_scale + hs_low.y; + res_high.x = + (res_high.x + attn_high.x + bias_high.x + attn_bias_high.x) * mp_scale + hs_high.x; + res_high.y = + (res_high.y + attn_high.y + bias_high.y + attn_bias_high.y) * mp_scale + hs_high.y; + } else { + // residual += hidden_state + bias + res_low.x = (res_low.x + hs_low.x + bias_low.x); + res_low.y = (res_low.y + hs_low.y + bias_low.y); + res_high.x = (res_high.x + hs_high.x + bias_high.x); + res_high.y = (res_high.y + hs_high.y + bias_high.y); + } + res_half2[0] = __float22half2_rn(res_low); + res_half2[1] = __float22half2_rn(res_high); + + res_fl2_ptr[offset] = 
res_fl2; } -#endif } template -void launch_bias_residual(T* input, - T* output, +void launch_bias_residual(T* residual, + T* hidden_state, T* attn, T* bias, T* attn_bias, int batch, int hidden_dim, int mp_size, + bool preln, cudaStream_t stream) { int total_count = batch * hidden_dim / 4; dim3 block_dims(1024); dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - fused_bias_residual<<>>( - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); + fused_bias_residual<<>>(residual, + hidden_state, + attn, + bias, + attn_bias, + total_count, + hidden_dim / 4, + 1.0 / mp_size, + preln); } -template void -launch_bias_residual(float*, float*, float*, float*, float*, int, int, int, cudaStream_t); -template void launch_bias_residual<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - cudaStream_t); - -__global__ void gptj_residual_add(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - float mp_size) +template void launch_bias_residual< + float>(float*, float*, float*, float*, float*, int, int, int, bool, cudaStream_t); +template void launch_bias_residual< + __half>(__half*, __half*, __half*, __half*, __half*, int, int, int, bool, cudaStream_t); + +__global__ void gptj_residual_add(float* residual, + const float* hidden_state, + const float* attn, + const float* bias, + const float* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale) { - float4* input_cast = reinterpret_cast(input); - float4* output_cast = reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; + float4* res_fl4_ptr = reinterpret_cast(residual); + const float4* hs_fl4_ptr = reinterpret_cast(hidden_state); + const float4* attn_fl4_ptr = reinterpret_cast(attn); 
+ const float4* bias_fl4_ptr = reinterpret_cast(bias); + const float4* attn_bias_fl4_ptr = reinterpret_cast(attn_bias); + const int offset = blockIdx.x * blockDim.x + threadIdx.x; if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = data.x * mp_size + (out.x + res_vec.x + bias_data.x + attn_bias.x); - data.y = data.y * mp_size + (out.y + res_vec.y + bias_data.y + attn_bias.y); - data.z = data.z * mp_size + (out.z + res_vec.z + bias_data.z + attn_bias.z); - data.w = data.w * mp_size + (out.w + res_vec.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; + float4 res_fl4 = res_fl4_ptr[offset]; + const float4 hs_fl4 = hs_fl4_ptr[offset]; + const float4 attn_fl4 = attn_fl4_ptr[offset]; + const float4 bias_fl4 = bias_fl4_ptr[offset % intermediate_size]; + + if (attn_bias) { + float4 attn_bias_fl4 = attn_bias_fl4_ptr[offset % intermediate_size]; + // residual += attention_bias + res_fl4.x += attn_bias_fl4.x; + res_fl4.y += attn_bias_fl4.y; + res_fl4.z += attn_bias_fl4.z; + res_fl4.w += attn_bias_fl4.w; + } + // residual = hidden_state + attention + (residual + bias) * mp_scale + res_fl4.x = hs_fl4.x + attn_fl4.x + (res_fl4.x + bias_fl4.x) * mp_scale; + res_fl4.y = hs_fl4.y + attn_fl4.y + (res_fl4.y + bias_fl4.y) * mp_scale; + res_fl4.z = hs_fl4.z + attn_fl4.z + (res_fl4.z + bias_fl4.z) * mp_scale; + res_fl4.w = hs_fl4.w + attn_fl4.w + (res_fl4.w + bias_fl4.w) * mp_scale; + + res_fl4_ptr[offset] = res_fl4; } } -__global__ void gptj_residual_add(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - float mp_size) +__global__ void gptj_residual_add(__half* residual, + const __half* hidden_state, + const __half* attn, + const __half* bias, + const __half* attn_bias, + 
const int total_count, + const int intermediate_size, + const float mp_scale) { -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; + float2* res_fl2_ptr = reinterpret_cast(residual); + const float2* hs_fl2_ptr = reinterpret_cast(hidden_state); + const float2* attn_fl2_ptr = reinterpret_cast(attn); + const float2* bias_fl2_ptr = reinterpret_cast(bias); + const float2* attn_bias_fl2_ptr = reinterpret_cast(attn_bias); + const int offset = blockIdx.x * blockDim.x + threadIdx.x; if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - low_data.x * mp_size + (low_out.x + low_res.x + (low_bias.x + 
attn_low_bias.x)); - low_data.y = - low_data.y * mp_size + (low_out.y + low_res.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - high_data.x * mp_size + (high_out.x + high_res.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - high_data.y * mp_size + (high_out.y + high_res.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; + float2 res_fl2 = res_fl2_ptr[offset]; + const float2 hs_fl2 = hs_fl2_ptr[offset]; + const float2 attn_fl2 = attn_fl2_ptr[offset]; + const float2 bias_fl2 = bias_fl2_ptr[offset % intermediate_size]; + + __half2* res_half2 = reinterpret_cast<__half2*>(&res_fl2); + const __half2* hs_half2 = reinterpret_cast(&hs_fl2); + const __half2* attn_half2 = reinterpret_cast(&attn_fl2); + const __half2* bias_half2 = reinterpret_cast(&bias_fl2); + + float2 res_low = __half22float2(res_half2[0]); + float2 res_high = __half22float2(res_half2[1]); + + const float2 hs_low = __half22float2(hs_half2[0]); + const float2 hs_high = __half22float2(hs_half2[1]); + + const float2 attn_low = __half22float2(attn_half2[0]); + const float2 attn_high = __half22float2(attn_half2[1]); + + const float2 bias_low = __half22float2(bias_half2[0]); + const float2 bias_high = __half22float2(bias_half2[1]); + + if (attn_bias) { + const float2 attn_bias_fl2 = attn_bias_fl2_ptr[offset % intermediate_size]; + const __half2* attn_bias_half2 = reinterpret_cast(&attn_bias_fl2); + const float2 attn_bias_low = __half22float2(attn_bias_half2[0]); + const float2 attn_bias_high = __half22float2(attn_bias_half2[1]); + // residual += attention_bias + res_low.x += attn_bias_low.x; + res_low.y += attn_bias_low.y; + res_high.x += attn_bias_high.x; + res_high.y += attn_bias_high.y; + } + // residual = hidden_state + attention + (residual + bias) * mp_scale + res_low.x = attn_low.x + hs_low.x + (res_low.x + bias_low.x) * mp_scale; + res_low.y = attn_low.y + hs_low.y + 
(res_low.y + bias_low.y) * mp_scale; + res_high.x = attn_high.x + hs_high.x + (res_high.x + bias_high.x) * mp_scale; + res_high.y = attn_high.y + hs_high.y + (res_high.y + bias_high.y) * mp_scale; + + res_half2[0] = __float22half2_rn(res_low); + res_half2[1] = __float22half2_rn(res_high); + + res_fl2_ptr[offset] = res_fl2; } -#endif } template -void launch_gptj_residual_add(T* input, - T* output, +void launch_gptj_residual_add(T* residual, + T* hidden_state, T* attn, T* bias, T* attn_bias, @@ -410,7 +383,7 @@ void launch_gptj_residual_add(T* input, dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); gptj_residual_add<<>>( - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); + residual, hidden_state, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); } template void launch_gptj_residual_add(float*, @@ -431,69 +404,33 @@ template void launch_gptj_residual_add<__half>(__half*, int, int, cudaStream_t); - -__global__ void moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - int hidden_dim) -{ - unsigned tid = threadIdx.x; - float4* residual_cast = reinterpret_cast(residual); - float4* coef_cast = reinterpret_cast(coef); - float4* mlp_out_cast = reinterpret_cast(mlp_out); - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - float4* coef_cast2 = coef_cast + hidden_dim; - - while (tid < hidden_dim) { - float4 res = residual_cast[tid]; - float4 mlp = mlp_out_cast[tid]; - float4 coef1 = coef_cast[tid]; - float4 coef2 = coef_cast2[tid]; - mlp.x = mlp.x * coef2.x + res.x * coef1.x; - mlp.y = mlp.y * coef2.y + res.y * coef1.y; - mlp.z = mlp.z * coef2.z + res.z * coef1.z; - mlp.w = mlp.w * coef2.w + res.w * coef1.w; - mlp_out_cast[tid] = mlp; - tid += blockDim.x; - } -} - -__global__ void moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim) +template +__global__ void moe_res_matmul(T* residual, T* coef, 
T* mlp_out, int seq_len, int hidden_dim) { - unsigned tid = threadIdx.x; - - float2* residual_cast = reinterpret_cast(residual); - float2* mlp_out_cast = reinterpret_cast(mlp_out); - float2* coef_cast = reinterpret_cast(coef); - float2* coef_cast2 = coef_cast + hidden_dim; - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - while (tid < hidden_dim) { - float2 res = residual_cast[tid]; - float2 coef1 = coef_cast[tid]; - float2 coef2 = coef_cast[tid]; - float2 data = mlp_out_cast[tid]; - __half* data_h = reinterpret_cast<__half*>(&data); - __half* coef1_h = reinterpret_cast<__half*>(&coef1); - __half* coef2_h = reinterpret_cast<__half*>(&coef2); - __half* res_h = reinterpret_cast<__half*>(&res); - data_h[0] = res_h[0] * coef1_h[0] + data_h[0] * coef2_h[0]; - data_h[1] = res_h[1] * coef1_h[1] + data_h[1] * coef2_h[1]; - data_h[2] = res_h[2] * coef1_h[2] + data_h[2] * coef2_h[2]; - data_h[3] = res_h[3] * coef1_h[3] + data_h[3] * coef2_h[3]; - - mlp_out_cast[tid] = data; - tid += blockDim.x; + constexpr int granularity = 16; + constexpr int vals_per_access = granularity / sizeof(T); + + T* residual_seq = residual + blockIdx.x * hidden_dim; + T* mlp_out_seq = mlp_out + blockIdx.x * hidden_dim; + + for (unsigned tid = threadIdx.x * vals_per_access; tid < hidden_dim; + tid += blockDim.x * vals_per_access) { + T mlp[vals_per_access]; + T res[vals_per_access]; + T coef1[vals_per_access]; + T coef2[vals_per_access]; + + mem_access::load_global(mlp, mlp_out_seq + tid); + mem_access::load_global(res, residual_seq + tid); + mem_access::load_global(coef1, coef + tid); + mem_access::load_global(coef2, coef + tid + hidden_dim); + +#pragma unroll + for (int idx = 0; idx < vals_per_access; idx++) { + mlp[idx] = mlp[idx] * coef2[idx] + res[idx] * coef1[idx]; + } + + mem_access::store_global(mlp_out_seq + tid, mlp); } } @@ -508,7 +445,7 @@ void launch_moe_res_matmul(T* residual, dim3 grid_dim(seq_len); dim3 block_dim(1024); moe_res_matmul<<>>( - 
residual, coef, mlp_out, seq_len, hidden_dim / 4); + residual, coef, mlp_out, seq_len, hidden_dim); } template void launch_moe_res_matmul(float* residual, @@ -523,3 +460,224 @@ template void launch_moe_res_matmul(__half* residual, int seq_len, int hidden_dim, cudaStream_t stream); + +__global__ void pad_data_kernel(__half* padded_output, + __half* output, + int head_size, + int padded_head_size) +{ + float4* padded_output_cast = reinterpret_cast(padded_output); + float4* output_cast = reinterpret_cast(output); + int bid = blockIdx.x * (blockDim.y) + threadIdx.y; + int idx = threadIdx.x; + padded_output_cast += (bid * padded_head_size); + output_cast += (bid * head_size); + float4 ZERO; + const __half2 zero_h = __float2half2_rn(0.f); + __half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO); +#pragma unroll + for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h; + if (idx < head_size) + padded_output_cast[idx] = output_cast[idx]; + else + padded_output_cast[idx] = ZERO; +} +__global__ void pad_data_kernel(float* padded_output, + float* output, + int head_size, + int padded_head_size) +{ +} +template +void pad_data(T* padded_output, + T* output, + int bsz, + int head_size, + int padded_head_size, + cudaStream_t stream) +{ + dim3 grid_dim((bsz - 1) / 16 + 1); + dim3 block_dim(padded_head_size / 8, 16); + pad_data_kernel<<>>( + padded_output, output, head_size / 8, padded_head_size / 8); +} +template void pad_data(__half* padded_output, + __half* output, + int bsz, + int head_size, + int padded_head_size, + cudaStream_t stream); +template void pad_data(float* padded_output, + float* output, + int bsz, + int head_size, + int padded_head_size, + cudaStream_t stream); + +__global__ void pad_head_seq_kernel(__half* padded_output, + __half* output, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size) +{ + float4* padded_output_cast = reinterpret_cast(padded_output); + float4* output_cast = reinterpret_cast(output); + int bsz = blockIdx.x; + int bid = 
blockIdx.y * (blockDim.y) + threadIdx.y; + int idx = threadIdx.x; + padded_output_cast += (bsz * padded_seq_len + bid) * padded_head_size; + output_cast += (bsz * seq_len + bid) * head_size; + float4 ZERO; + const __half2 zero_h = __float2half2_rn(0.f); + __half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO); +#pragma unroll + for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h; + + if (idx < head_size && bid < seq_len) + padded_output_cast[idx] = output_cast[idx]; + else + padded_output_cast[idx] = ZERO; +} +__global__ void pad_head_seq_kernel(float* padded_output, + float* output, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size) +{ +} +template +void pad_head_seq(T* padded_output, + T* output, + int bsz, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size, + cudaStream_t stream) +{ + dim3 grid_dim(bsz, padded_seq_len / 16); + dim3 block_dim(padded_head_size / 8, 16); + pad_head_seq_kernel<<>>( + padded_output, output, seq_len, padded_seq_len, head_size / 8, padded_head_size / 8); +} +template void pad_head_seq(__half* padded_output, + __half* output, + int bsz, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size, + cudaStream_t stream); +template void pad_head_seq(float* padded_output, + float* output, + int bsz, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size, + cudaStream_t stream); + +// TODO(cmikeh2): evaluate different GeLU performance +__device__ __forceinline__ float old_gelu(float val) +{ + // 1 / sqrt(2) + constexpr float rsqrt_2 = 0.707106769084930419922; + return val * 0.5f * (1.0f + erff(val * rsqrt_2)); +} + +namespace fused_geglu { +constexpr int threads = 256; +constexpr int steps = 2; +constexpr int granularity = 16; +} // namespace fused_geglu + +template +__global__ void fused_bias_geglu(T* output, + const T* activation, + const T* bias, + int base_channels, + int total_elems) +{ + constexpr int T_per_access = fused_geglu::granularity / 
sizeof(T); + constexpr int T_per_step = T_per_access * fused_geglu::threads; + constexpr int T_per_block = T_per_step * fused_geglu::steps; + + const int id = blockIdx.x * T_per_block + threadIdx.x * T_per_access; + +#pragma unroll + for (int i = 0; i < fused_geglu::steps; i++) { + T activation_buffer_1[T_per_access]; + T activation_buffer_2[T_per_access]; + T bias_buffer_1[T_per_access]; + T bias_buffer_2[T_per_access]; + + const int iter_id = id + T_per_step * i; + if (iter_id < total_elems) { + const int channel_id = iter_id % base_channels; + const int seq_id = iter_id / base_channels; + const int seq_offset = seq_id * base_channels * 2; + + mem_access::load_global(activation_buffer_1, + activation + seq_offset + channel_id); + mem_access::load_global( + activation_buffer_2, activation + seq_offset + channel_id + base_channels); + mem_access::load_global(bias_buffer_1, bias + channel_id); + mem_access::load_global(bias_buffer_2, + bias + channel_id + base_channels); + + // Since the GeLU is going to happen at float, might as well + // convert +#pragma unroll + for (int v = 0; v < T_per_access; v++) { + T hidden_state = activation_buffer_1[v] + bias_buffer_1[v]; + T pre_gate = activation_buffer_2[v] + bias_buffer_2[v]; + float gate_f = old_gelu(conversion::to(pre_gate)); + T gate = conversion::to(gate_f); + activation_buffer_1[v] = hidden_state * gate; + } + + mem_access::store_global(output + iter_id, + activation_buffer_1); + } + } +} + +template +void launch_fused_bias_geglu(T* output, + const T* activation, + const T* bias, + int rows, + int elems_per_row, + cudaStream_t stream) +{ + /* + Fused bias GEGLU is a variant of the gated activation functions. + The input here is a matrix of [batch, seq_len, 2 * intermediate_dim] + where the second half of the channels act as GeLU gates for the first + half. 
+ */ + + // Re-derive the above figures + constexpr int T_per_access = fused_geglu::granularity / sizeof(T); + constexpr int T_per_step = T_per_access * fused_geglu::threads; + constexpr int T_per_block = T_per_step * fused_geglu::steps; + + const int base_channels = elems_per_row / 2; + const int total_elems = base_channels * rows; + + dim3 block(fused_geglu::threads); + dim3 grid((total_elems + T_per_block - 1) / T_per_block); + + fused_bias_geglu<<>>( + output, activation, bias, base_channels, total_elems); +} + +template void launch_fused_bias_geglu(__half*, + const __half*, + const __half*, + int, + int, + cudaStream_t); +template void launch_fused_bias_geglu(float*, const float*, const float*, int, int, cudaStream_t); diff --git a/csrc/transformer/inference/csrc/gelu.hip b/csrc/transformer/inference/csrc/gelu.hip index 00c03efb9b6b3d7b05f19549472b5b771f46e1f4..6665406e4bc040acc387c0e61bab5dc855110cdb 100644 --- a/csrc/transformer/inference/csrc/gelu.hip +++ b/csrc/transformer/inference/csrc/gelu.hip @@ -1,7 +1,14 @@ // !!! This is a file automatically generated by hipify!!! 
#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ +#include "conversion_utils.h" +#include "inference_cuda_layers.h" +#include "memory_access_utils.h" + +namespace cg = cooperative_groups; #define MAX_CAP 4 #define MAX_SEQ 2048 @@ -12,74 +19,32 @@ inline __device__ float gelu(const float x) return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); } -__global__ void fused_bias_gelu(float* input, - const float* bias, - int total_count, - int intermediate_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_gelu(__half* input, - const __half* bias, - int total_count, - int intermediate_size) +/* +In-place gelu(biasAdd(x)) for channels last +*/ +template +__global__ void fused_bias_gelu(T* input, const T* bias, int total_count, int intermediate_size) { -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; + // Input restriction: intermediate_size % vals_per_access == 0 + constexpr int granularity = 16; + constexpr int values_per_access = granularity / sizeof(T); + const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * values_per_access; if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = 
reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; + T data[values_per_access]; + T data_bias[values_per_access]; + mem_access::load_global(data, input + offset); + mem_access::load_global(data_bias, bias + (offset % intermediate_size)); + +#pragma unroll + for (int i = 0; i < values_per_access; i++) { + float data_f = conversion::to(data[i]); + float bias_f = conversion::to(data_bias[i]); + data[i] = conversion::to(gelu(data_f + bias_f)); + } + + mem_access::store_global(input + offset, data); } -#endif } template @@ -89,316 +54,324 @@ void launch_bias_gelu(T* input, int batch_size, hipStream_t stream) { - int total_count = batch_size * (intermediate_size / 4); - int threads = 1024; // intermediate_size / iterations / 4; + constexpr int threads = 1024; + constexpr int granularity = 16; + + const int total_count = batch_size * intermediate_size; + const int elems_per_block = threads * (granularity / sizeof(T)); dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / 1024 + 1)); // (batch_size); + dim3 grid_dims((total_count + elems_per_block - 1) / elems_per_block); hipLaunchKernelGGL(( fused_bias_gelu), dim3(grid_dims), dim3(block_dims), 0, stream, - input, bias, total_count, intermediate_size / 4); + input, bias, total_count, intermediate_size); } template void launch_bias_gelu(float*, const float*, int, int, hipStream_t); template void launch_bias_gelu<__half>(__half*, const __half*, 
int, int, hipStream_t); -__global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % hidden_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size) +/* +In-place channels-last bias add +*/ +template +__global__ void fused_bias_add(T* input, const T* bias, int total_count, int intermediate_size) { -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; + // Input restriction: intermediate_size % vals_per_access == 0 + constexpr int granularity = 16; + constexpr int values_per_access = granularity / sizeof(T); + const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * values_per_access; if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % hidden_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; + T data[values_per_access]; + T data_bias[values_per_access]; + 
mem_access::load_global(data, input + offset); + mem_access::load_global(data_bias, bias + (offset % intermediate_size)); + +#pragma unroll + for (int i = 0; i < values_per_access; i++) { + float data_f = conversion::to(data[i]); + float bias_f = conversion::to(data_bias[i]); + data[i] = conversion::to(data_f + bias_f); + } + + mem_access::store_global(input + offset, data); } -#endif } template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream) +void launch_bias_add(T* input, + const T* bias, + int intermediate_size, + int batch_size, + hipStream_t stream) { - int total_count = batch_size * (hidden_size / 4); - int threads = 1024; // hidden_size / iterations / 4; + constexpr int threads = 1024; + constexpr int granularity = 16; + + const int total_count = batch_size * intermediate_size; + const int elems_per_block = threads * (granularity / sizeof(T)); dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / threads + 1)); // (batch_size); + dim3 grid_dims((total_count + elems_per_block - 1) / elems_per_block); - hipLaunchKernelGGL(( fused_bias_add), dim3(grid_dims), dim3(block_dims), 0, stream, input, bias, total_count, hidden_size / 4); + hipLaunchKernelGGL(( fused_bias_add), dim3(grid_dims), dim3(block_dims), 0, stream, + input, bias, total_count, intermediate_size); } template void launch_bias_add(float*, const float*, int, int, hipStream_t); template void launch_bias_add<__half>(__half*, const __half*, int, int, hipStream_t); -__global__ void fused_bias_residual(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - int mp_size) +__global__ void fused_bias_residual(float* residual, + const float* hidden_state, + const float* attn, + const float* bias, + const float* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale, + const bool preln) { - float4* input_cast = reinterpret_cast(input); - float4* 
output_cast = reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; + float4* res_fl4_ptr = reinterpret_cast(residual); + const float4* hs_fl4_ptr = reinterpret_cast(hidden_state); + const float4* attn_fl4_ptr = reinterpret_cast(attn); + const float4* bias_fl4_ptr = reinterpret_cast(bias); + const float4* attn_bias_fl4_ptr = reinterpret_cast(attn_bias); + const int offset = blockIdx.x * blockDim.x + threadIdx.x; if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x); - data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y); - data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z); - data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; + float4 res_fl4 = res_fl4_ptr[offset]; + const float4 hs_fl4 = hs_fl4_ptr[offset]; + const float4 attn_fl4 = attn_fl4_ptr[offset]; + const float4 bias_fl4 = bias_fl4_ptr[offset % intermediate_size]; + const float4 attn_bias_fl4 = attn_bias_fl4_ptr[offset % intermediate_size]; + if (preln) { + // residual = (residual + attention + bias + attention_bias) * + // mp_scale + hidden_state + res_fl4.x = + (res_fl4.x + attn_fl4.x + bias_fl4.x + attn_bias_fl4.x) * mp_scale + (hs_fl4.x); + res_fl4.y = + (res_fl4.y + attn_fl4.y + bias_fl4.y + attn_bias_fl4.y) * mp_scale + (hs_fl4.y); + res_fl4.z = + (res_fl4.z + attn_fl4.z + bias_fl4.z + attn_bias_fl4.z) * mp_scale + (hs_fl4.z); + res_fl4.w = + (res_fl4.w + attn_fl4.w + bias_fl4.w + attn_bias_fl4.w) * mp_scale + (hs_fl4.w); + } else { + // 
residual += hidden_state + bias + res_fl4.x = res_fl4.x + hs_fl4.x + bias_fl4.x; + res_fl4.y = res_fl4.y + hs_fl4.y + bias_fl4.y; + res_fl4.z = res_fl4.z + hs_fl4.z + bias_fl4.z; + res_fl4.w = res_fl4.w + hs_fl4.w + bias_fl4.w; + } + res_fl4_ptr[offset] = res_fl4; } } -__global__ void fused_bias_residual(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - int mp_size) +__global__ void fused_bias_residual(__half* residual, + const __half* hidden_state, + const __half* attn, + const __half* bias, + const __half* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale, + const bool preln) { -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; + float2* res_fl2_ptr = reinterpret_cast(residual); + const float2* hs_fl2_ptr = reinterpret_cast(hidden_state); + const float2* attn_fl2_ptr = reinterpret_cast(attn); + const float2* bias_fl2_ptr = reinterpret_cast(bias); + const float2* attn_bias_fl2_ptr = reinterpret_cast(attn_bias); + const int offset = blockIdx.x * blockDim.x + threadIdx.x; if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); - - float2 
low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - (low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - (low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - (high_data.x + high_res.x) * mp_size + (high_out.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - (high_data.y + high_res.y) * mp_size + (high_out.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; + float2 res_fl2 = res_fl2_ptr[offset]; + const float2 hs_fl2 = hs_fl2_ptr[offset]; + const float2 attn_fl2 = attn_fl2_ptr[offset]; + const float2 bias_fl2 = bias_fl2_ptr[offset % intermediate_size]; + const float2 attn_bias_fl2 = attn_bias_fl2_ptr[offset % intermediate_size]; + + __half2* res_half2 = reinterpret_cast<__half2*>(&res_fl2); + const __half2* hs_half2 = reinterpret_cast(&hs_fl2); + const __half2* attn_half2 = reinterpret_cast(&attn_fl2); + const __half2* bias_half2 = reinterpret_cast(&bias_fl2); + const __half2* attn_bias_half2 = reinterpret_cast(&attn_bias_fl2); + + float2 res_low = __half22float2(res_half2[0]); + float2 res_high = __half22float2(res_half2[1]); + + const float2 hs_low = __half22float2(hs_half2[0]); + const float2 hs_high = __half22float2(hs_half2[1]); + + const float2 attn_low = __half22float2(attn_half2[0]); + const float2 attn_high = __half22float2(attn_half2[1]); + + const float2 bias_low = 
__half22float2(bias_half2[0]); + const float2 bias_high = __half22float2(bias_half2[1]); + + const float2 attn_bias_low = __half22float2(attn_bias_half2[0]); + const float2 attn_bias_high = __half22float2(attn_bias_half2[1]); + + if (preln) { + // residual = (residual + attention + bias + attention_bias) * + // mp_scale + hidden_state + res_low.x = + (res_low.x + attn_low.x + bias_low.x + attn_bias_low.x) * mp_scale + hs_low.x; + res_low.y = + (res_low.y + attn_low.y + bias_low.y + attn_bias_low.y) * mp_scale + hs_low.y; + res_high.x = + (res_high.x + attn_high.x + bias_high.x + attn_bias_high.x) * mp_scale + hs_high.x; + res_high.y = + (res_high.y + attn_high.y + bias_high.y + attn_bias_high.y) * mp_scale + hs_high.y; + } else { + // residual += hidden_state + bias + res_low.x = (res_low.x + hs_low.x + bias_low.x); + res_low.y = (res_low.y + hs_low.y + bias_low.y); + res_high.x = (res_high.x + hs_high.x + bias_high.x); + res_high.y = (res_high.y + hs_high.y + bias_high.y); + } + res_half2[0] = __float22half2_rn(res_low); + res_half2[1] = __float22half2_rn(res_high); + + res_fl2_ptr[offset] = res_fl2; } -#endif } template -void launch_bias_residual(T* input, - T* output, +void launch_bias_residual(T* residual, + T* hidden_state, T* attn, T* bias, T* attn_bias, int batch, int hidden_dim, int mp_size, + bool preln, hipStream_t stream) { int total_count = batch * hidden_dim / 4; dim3 block_dims(1024); dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - hipLaunchKernelGGL(( fused_bias_residual), dim3(grid_dims), dim3(block_dims), 0, stream, - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); + hipLaunchKernelGGL(( fused_bias_residual), dim3(grid_dims), dim3(block_dims), 0, stream, residual, + hidden_state, + attn, + bias, + attn_bias, + total_count, + hidden_dim / 4, + 1.0 / mp_size, + preln); } -template void -launch_bias_residual(float*, float*, float*, float*, float*, int, int, int, hipStream_t); -template void 
launch_bias_residual<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - hipStream_t); - -__global__ void gptj_residual_add(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - float mp_size) +template void launch_bias_residual< + float>(float*, float*, float*, float*, float*, int, int, int, bool, hipStream_t); +template void launch_bias_residual< + __half>(__half*, __half*, __half*, __half*, __half*, int, int, int, bool, hipStream_t); + +__global__ void gptj_residual_add(float* residual, + const float* hidden_state, + const float* attn, + const float* bias, + const float* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale) { - float4* input_cast = reinterpret_cast(input); - float4* output_cast = reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; + float4* res_fl4_ptr = reinterpret_cast(residual); + const float4* hs_fl4_ptr = reinterpret_cast(hidden_state); + const float4* attn_fl4_ptr = reinterpret_cast(attn); + const float4* bias_fl4_ptr = reinterpret_cast(bias); + const float4* attn_bias_fl4_ptr = reinterpret_cast(attn_bias); + const int offset = blockIdx.x * blockDim.x + threadIdx.x; if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = data.x * mp_size + (out.x + res_vec.x + bias_data.x + attn_bias.x); - data.y = data.y * mp_size + (out.y + res_vec.y + bias_data.y + attn_bias.y); - data.z = data.z * mp_size + (out.z + res_vec.z + bias_data.z + attn_bias.z); - data.w = data.w * mp_size + (out.w + res_vec.w + bias_data.w + 
attn_bias.w); - - output_cast[offset] = data; + float4 res_fl4 = res_fl4_ptr[offset]; + const float4 hs_fl4 = hs_fl4_ptr[offset]; + const float4 attn_fl4 = attn_fl4_ptr[offset]; + const float4 bias_fl4 = bias_fl4_ptr[offset % intermediate_size]; + + if (attn_bias) { + float4 attn_bias_fl4 = attn_bias_fl4_ptr[offset % intermediate_size]; + // residual += attention_bias + res_fl4.x += attn_bias_fl4.x; + res_fl4.y += attn_bias_fl4.y; + res_fl4.z += attn_bias_fl4.z; + res_fl4.w += attn_bias_fl4.w; + } + // residual = hidden_state + attention + (residual + bias) * mp_scale + res_fl4.x = hs_fl4.x + attn_fl4.x + (res_fl4.x + bias_fl4.x) * mp_scale; + res_fl4.y = hs_fl4.y + attn_fl4.y + (res_fl4.y + bias_fl4.y) * mp_scale; + res_fl4.z = hs_fl4.z + attn_fl4.z + (res_fl4.z + bias_fl4.z) * mp_scale; + res_fl4.w = hs_fl4.w + attn_fl4.w + (res_fl4.w + bias_fl4.w) * mp_scale; + + res_fl4_ptr[offset] = res_fl4; } } -__global__ void gptj_residual_add(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - float mp_size) +__global__ void gptj_residual_add(__half* residual, + const __half* hidden_state, + const __half* attn, + const __half* bias, + const __half* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale) { -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; + float2* res_fl2_ptr = reinterpret_cast(residual); + const float2* hs_fl2_ptr = reinterpret_cast(hidden_state); + const float2* attn_fl2_ptr = reinterpret_cast(attn); + const float2* bias_fl2_ptr = reinterpret_cast(bias); + const float2* attn_bias_fl2_ptr = reinterpret_cast(attn_bias); + const int 
offset = blockIdx.x * blockDim.x + threadIdx.x; if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - low_data.x * mp_size + (low_out.x + low_res.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - low_data.y * mp_size + (low_out.y + low_res.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - high_data.x * mp_size + (high_out.x + high_res.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - high_data.y * mp_size + (high_out.y + high_res.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; + float2 res_fl2 = res_fl2_ptr[offset]; + const float2 hs_fl2 = hs_fl2_ptr[offset]; + const float2 attn_fl2 = attn_fl2_ptr[offset]; + const float2 bias_fl2 = bias_fl2_ptr[offset % intermediate_size]; + + __half2* res_half2 = reinterpret_cast<__half2*>(&res_fl2); + const __half2* hs_half2 = 
reinterpret_cast(&hs_fl2); + const __half2* attn_half2 = reinterpret_cast(&attn_fl2); + const __half2* bias_half2 = reinterpret_cast(&bias_fl2); + + float2 res_low = __half22float2(res_half2[0]); + float2 res_high = __half22float2(res_half2[1]); + + const float2 hs_low = __half22float2(hs_half2[0]); + const float2 hs_high = __half22float2(hs_half2[1]); + + const float2 attn_low = __half22float2(attn_half2[0]); + const float2 attn_high = __half22float2(attn_half2[1]); + + const float2 bias_low = __half22float2(bias_half2[0]); + const float2 bias_high = __half22float2(bias_half2[1]); + + if (attn_bias) { + const float2 attn_bias_fl2 = attn_bias_fl2_ptr[offset % intermediate_size]; + const __half2* attn_bias_half2 = reinterpret_cast(&attn_bias_fl2); + const float2 attn_bias_low = __half22float2(attn_bias_half2[0]); + const float2 attn_bias_high = __half22float2(attn_bias_half2[1]); + // residual += attention_bias + res_low.x += attn_bias_low.x; + res_low.y += attn_bias_low.y; + res_high.x += attn_bias_high.x; + res_high.y += attn_bias_high.y; + } + // residual = hidden_state + attention + (residual + bias) * mp_scale + res_low.x = attn_low.x + hs_low.x + (res_low.x + bias_low.x) * mp_scale; + res_low.y = attn_low.y + hs_low.y + (res_low.y + bias_low.y) * mp_scale; + res_high.x = attn_high.x + hs_high.x + (res_high.x + bias_high.x) * mp_scale; + res_high.y = attn_high.y + hs_high.y + (res_high.y + bias_high.y) * mp_scale; + + res_half2[0] = __float22half2_rn(res_low); + res_half2[1] = __float22half2_rn(res_high); + + res_fl2_ptr[offset] = res_fl2; } -#endif } template -void launch_gptj_residual_add(T* input, - T* output, +void launch_gptj_residual_add(T* residual, + T* hidden_state, T* attn, T* bias, T* attn_bias, @@ -412,7 +385,7 @@ void launch_gptj_residual_add(T* input, dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); hipLaunchKernelGGL(( gptj_residual_add), dim3(grid_dims), dim3(block_dims), 0, stream, - input, output, attn, bias, attn_bias, 
total_count, hidden_dim / 4, 1.0 / mp_size); + residual, hidden_state, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); } template void launch_gptj_residual_add(float*, @@ -433,69 +406,33 @@ template void launch_gptj_residual_add<__half>(__half*, int, int, hipStream_t); - -__global__ void moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - int hidden_dim) -{ - unsigned tid = threadIdx.x; - float4* residual_cast = reinterpret_cast(residual); - float4* coef_cast = reinterpret_cast(coef); - float4* mlp_out_cast = reinterpret_cast(mlp_out); - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - float4* coef_cast2 = coef_cast + hidden_dim; - - while (tid < hidden_dim) { - float4 res = residual_cast[tid]; - float4 mlp = mlp_out_cast[tid]; - float4 coef1 = coef_cast[tid]; - float4 coef2 = coef_cast2[tid]; - mlp.x = mlp.x * coef2.x + res.x * coef1.x; - mlp.y = mlp.y * coef2.y + res.y * coef1.y; - mlp.z = mlp.z * coef2.z + res.z * coef1.z; - mlp.w = mlp.w * coef2.w + res.w * coef1.w; - mlp_out_cast[tid] = mlp; - tid += blockDim.x; - } -} - -__global__ void moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim) +template +__global__ void moe_res_matmul(T* residual, T* coef, T* mlp_out, int seq_len, int hidden_dim) { - unsigned tid = threadIdx.x; - - float2* residual_cast = reinterpret_cast(residual); - float2* mlp_out_cast = reinterpret_cast(mlp_out); - float2* coef_cast = reinterpret_cast(coef); - float2* coef_cast2 = coef_cast + hidden_dim; - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - while (tid < hidden_dim) { - float2 res = residual_cast[tid]; - float2 coef1 = coef_cast[tid]; - float2 coef2 = coef_cast[tid]; - float2 data = mlp_out_cast[tid]; - __half* data_h = reinterpret_cast<__half*>(&data); - __half* coef1_h = reinterpret_cast<__half*>(&coef1); - __half* coef2_h = 
reinterpret_cast<__half*>(&coef2); - __half* res_h = reinterpret_cast<__half*>(&res); - data_h[0] = res_h[0] * coef1_h[0] + data_h[0] * coef2_h[0]; - data_h[1] = res_h[1] * coef1_h[1] + data_h[1] * coef2_h[1]; - data_h[2] = res_h[2] * coef1_h[2] + data_h[2] * coef2_h[2]; - data_h[3] = res_h[3] * coef1_h[3] + data_h[3] * coef2_h[3]; - - mlp_out_cast[tid] = data; - tid += blockDim.x; + constexpr int granularity = 16; + constexpr int vals_per_access = granularity / sizeof(T); + + T* residual_seq = residual + blockIdx.x * hidden_dim; + T* mlp_out_seq = mlp_out + blockIdx.x * hidden_dim; + + for (unsigned tid = threadIdx.x * vals_per_access; tid < hidden_dim; + tid += blockDim.x * vals_per_access) { + T mlp[vals_per_access]; + T res[vals_per_access]; + T coef1[vals_per_access]; + T coef2[vals_per_access]; + + mem_access::load_global(mlp, mlp_out_seq + tid); + mem_access::load_global(res, residual_seq + tid); + mem_access::load_global(coef1, coef + tid); + mem_access::load_global(coef2, coef + tid + hidden_dim); + +#pragma unroll + for (int idx = 0; idx < vals_per_access; idx++) { + mlp[idx] = mlp[idx] * coef2[idx] + res[idx] * coef1[idx]; + } + + mem_access::store_global(mlp_out_seq + tid, mlp); } } @@ -510,7 +447,7 @@ void launch_moe_res_matmul(T* residual, dim3 grid_dim(seq_len); dim3 block_dim(1024); hipLaunchKernelGGL(( moe_res_matmul), dim3(grid_dim), dim3(block_dim), 0, stream, - residual, coef, mlp_out, seq_len, hidden_dim / 4); + residual, coef, mlp_out, seq_len, hidden_dim); } template void launch_moe_res_matmul(float* residual, @@ -525,3 +462,224 @@ template void launch_moe_res_matmul(__half* residual, int seq_len, int hidden_dim, hipStream_t stream); + +__global__ void pad_data_kernel(__half* padded_output, + __half* output, + int head_size, + int padded_head_size) +{ + float4* padded_output_cast = reinterpret_cast(padded_output); + float4* output_cast = reinterpret_cast(output); + int bid = blockIdx.x * (blockDim.y) + threadIdx.y; + int idx = threadIdx.x; + 
padded_output_cast += (bid * padded_head_size); + output_cast += (bid * head_size); + float4 ZERO; + const __half2 zero_h = __float2half2_rn(0.f); + __half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO); +#pragma unroll + for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h; + if (idx < head_size) + padded_output_cast[idx] = output_cast[idx]; + else + padded_output_cast[idx] = ZERO; +} +__global__ void pad_data_kernel(float* padded_output, + float* output, + int head_size, + int padded_head_size) +{ +} +template +void pad_data(T* padded_output, + T* output, + int bsz, + int head_size, + int padded_head_size, + hipStream_t stream) +{ + dim3 grid_dim((bsz - 1) / 16 + 1); + dim3 block_dim(padded_head_size / 8, 16); + hipLaunchKernelGGL(( pad_data_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, + padded_output, output, head_size / 8, padded_head_size / 8); +} +template void pad_data(__half* padded_output, + __half* output, + int bsz, + int head_size, + int padded_head_size, + hipStream_t stream); +template void pad_data(float* padded_output, + float* output, + int bsz, + int head_size, + int padded_head_size, + hipStream_t stream); + +__global__ void pad_head_seq_kernel(__half* padded_output, + __half* output, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size) +{ + float4* padded_output_cast = reinterpret_cast(padded_output); + float4* output_cast = reinterpret_cast(output); + int bsz = blockIdx.x; + int bid = blockIdx.y * (blockDim.y) + threadIdx.y; + int idx = threadIdx.x; + padded_output_cast += (bsz * padded_seq_len + bid) * padded_head_size; + output_cast += (bsz * seq_len + bid) * head_size; + float4 ZERO; + const __half2 zero_h = __float2half2_rn(0.f); + __half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO); +#pragma unroll + for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h; + + if (idx < head_size && bid < seq_len) + padded_output_cast[idx] = output_cast[idx]; + else + padded_output_cast[idx] = ZERO; +} +__global__ void 
pad_head_seq_kernel(float* padded_output, + float* output, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size) +{ +} +template +void pad_head_seq(T* padded_output, + T* output, + int bsz, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size, + hipStream_t stream) +{ + dim3 grid_dim(bsz, padded_seq_len / 16); + dim3 block_dim(padded_head_size / 8, 16); + hipLaunchKernelGGL(( pad_head_seq_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, + padded_output, output, seq_len, padded_seq_len, head_size / 8, padded_head_size / 8); +} +template void pad_head_seq(__half* padded_output, + __half* output, + int bsz, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size, + hipStream_t stream); +template void pad_head_seq(float* padded_output, + float* output, + int bsz, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size, + hipStream_t stream); + +// TODO(cmikeh2): evaluate different GeLU performance +__device__ __forceinline__ float old_gelu(float val) +{ + // 1 / sqrt(2) + constexpr float rsqrt_2 = 0.707106769084930419922; + return val * 0.5f * (1.0f + erff(val * rsqrt_2)); +} + +namespace fused_geglu { +constexpr int threads = 256; +constexpr int steps = 2; +constexpr int granularity = 16; +} // namespace fused_geglu + +template +__global__ void fused_bias_geglu(T* output, + const T* activation, + const T* bias, + int base_channels, + int total_elems) +{ + constexpr int T_per_access = fused_geglu::granularity / sizeof(T); + constexpr int T_per_step = T_per_access * fused_geglu::threads; + constexpr int T_per_block = T_per_step * fused_geglu::steps; + + const int id = blockIdx.x * T_per_block + threadIdx.x * T_per_access; + +#pragma unroll + for (int i = 0; i < fused_geglu::steps; i++) { + T activation_buffer_1[T_per_access]; + T activation_buffer_2[T_per_access]; + T bias_buffer_1[T_per_access]; + T bias_buffer_2[T_per_access]; + + const int iter_id = id + T_per_step * i; + if 
(iter_id < total_elems) { + const int channel_id = iter_id % base_channels; + const int seq_id = iter_id / base_channels; + const int seq_offset = seq_id * base_channels * 2; + + mem_access::load_global(activation_buffer_1, + activation + seq_offset + channel_id); + mem_access::load_global( + activation_buffer_2, activation + seq_offset + channel_id + base_channels); + mem_access::load_global(bias_buffer_1, bias + channel_id); + mem_access::load_global(bias_buffer_2, + bias + channel_id + base_channels); + + // Since the GeLU is going to happen at float, might as well + // convert +#pragma unroll + for (int v = 0; v < T_per_access; v++) { + T hidden_state = activation_buffer_1[v] + bias_buffer_1[v]; + T pre_gate = activation_buffer_2[v] + bias_buffer_2[v]; + float gate_f = old_gelu(conversion::to(pre_gate)); + T gate = conversion::to(gate_f); + activation_buffer_1[v] = hidden_state * gate; + } + + mem_access::store_global(output + iter_id, + activation_buffer_1); + } + } +} + +template +void launch_fused_bias_geglu(T* output, + const T* activation, + const T* bias, + int rows, + int elems_per_row, + hipStream_t stream) +{ + /* + Fused bias GEGLU is a variant of the gated activation functions. + The input here is a matrix of [batch, seq_len, 2 * intermediate_dim] + where the second half of the channels act as GeLU gates for the first + half. 
+ */ + + // Re-derive the above figures + constexpr int T_per_access = fused_geglu::granularity / sizeof(T); + constexpr int T_per_step = T_per_access * fused_geglu::threads; + constexpr int T_per_block = T_per_step * fused_geglu::steps; + + const int base_channels = elems_per_row / 2; + const int total_elems = base_channels * rows; + + dim3 block(fused_geglu::threads); + dim3 grid((total_elems + T_per_block - 1) / T_per_block); + + hipLaunchKernelGGL(( fused_bias_geglu), dim3(grid), dim3(block), 0, stream, + output, activation, bias, base_channels, total_elems); +} + +template void launch_fused_bias_geglu(__half*, + const __half*, + const __half*, + int, + int, + hipStream_t); +template void launch_fused_bias_geglu(float*, const float*, const float*, int, int, hipStream_t); diff --git a/csrc/transformer/inference/csrc/layer_norm.cu b/csrc/transformer/inference/csrc/layer_norm.cu new file mode 100644 index 0000000000000000000000000000000000000000..06078512cef5c808606144a36bfed7b41acc49bf --- /dev/null +++ b/csrc/transformer/inference/csrc/layer_norm.cu @@ -0,0 +1,529 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "conversion_utils.h" +#include "ds_kernel_utils.h" +#include "inference_cuda_layers.h" +#include "memory_access_utils.h" +#include "reduction_utils.h" + +namespace cg = cooperative_groups; +using rop = reduce::ROpType; + +namespace ln { +constexpr int granularity = 16; +} // namespace ln + +/* +Primary layer norm implementation. Assumes elems_per_row % 8 +is equal to 0. 
+ +Args: + output: buffer for output data + vals: buffer for input data + gamma: gain for normalization + beta: bias for normalization + epsilon: numeric stability + elems_per_row: number of elements each block will normalize +*/ +template +__global__ void fused_ln(T* output, + const T* vals, + const T* gamma, + const T* beta, + float epsilon, + int elems_per_row) +{ + constexpr int T_per_load = ln::granularity / sizeof(T); + + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + // X-dimension of the block + const int block_offset = (tb.group_index().x * (maxThreads / threadsPerGroup) * elems_per_row) + + (tb.thread_index().y * elems_per_row); + const int thread_offset = tb.thread_index().x * T_per_load; + const int base_offset = block_offset + thread_offset; + const int stride = tb.size() * T_per_load; + + float sum = reduce::init(); + + const T* input_base = vals + base_offset; + + T local_buffer[unRoll * T_per_load]; + +#pragma unRoll + for (int i = 0; i < unRoll; i++) { + T* iteration_buffer = local_buffer + i * T_per_load; + T residual_buffer[T_per_load]; + T bias_buffer[T_per_load]; + + mem_access::load_global( + iteration_buffer, input_base + i * stride, thread_offset + i * stride < elems_per_row); + +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + float vals_up_cast = conversion::to(iteration_buffer[j]); + sum = reduce::element(sum, vals_up_cast); + } + } + + reduce::partitioned_block(tb, warp, sum); + const float mean = sum / elems_per_row; + + float mean_diff = reduce::init(); + +#pragma unRoll + for (int i = 0; i < unRoll; i++) { +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + // Using a 0 value here skews the variance, have to if-guard + if (thread_offset + i * stride < elems_per_row) { + float diff = (conversion::to(local_buffer[i * T_per_load + j]) - mean); + mean_diff = reduce::element(mean_diff, diff * diff); + } + } + } + + reduce::partitioned_block(tb, warp, mean_diff); + 
const float variance = mean_diff / elems_per_row; + const float denom = __frsqrt_rn(variance + epsilon); + + const T mean_compute = conversion::to(mean); + const T denom_compute = conversion::to(denom); + + T* block_output = output + block_offset; + +#pragma unRoll + for (int i = 0; i < unRoll; i++) { + T* iteration_buffer = local_buffer + i * T_per_load; + const int iter_idx = i * stride + thread_offset; + const bool do_loads = iter_idx < elems_per_row; + + T gamma_local[T_per_load], beta_local[T_per_load]; + + mem_access::load_global(gamma_local, gamma + iter_idx, do_loads); + mem_access::load_global(beta_local, beta + iter_idx, do_loads); + +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + iteration_buffer[j] = (iteration_buffer[j] - mean_compute) * denom_compute; + iteration_buffer[j] = iteration_buffer[j] * gamma_local[j] + beta_local[j]; + } + + if (do_loads) { + mem_access::store_global(block_output + iter_idx, iteration_buffer); + } + } +} + +#define LAUNCH_FUSED_LN(unRollFactor, threadsPerGroup, maxThreads) \ + fused_ln \ + <<>>(output, vals, gamma, beta, epsilon, elems_per_row); + +template +void launch_fused_ln(T* output, + const T* vals, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + cudaStream_t stream) +{ + // 8 for __half, 4 for float + constexpr int T_per_load = ln::granularity / sizeof(T); + + constexpr int maxThreads = 256; + + // For Flaoat, unRoll 4, for __half, unRoll 2 + constexpr int internal_unRoll = sizeof(T) == 4 ? 4 : 2; + + const bool is_subblock_schedule = (elems_per_row <= 128) ? true : false; + const int h_per_step = is_subblock_schedule ? 
T_per_load : T_per_load * internal_unRoll; + + // Scheduling concern: may be slightly faster for some inputs to assign multiple stages of + // warp-sized blocks rather than stepping up to 64/96 threads + const int one_step_threads = next_pow2((elems_per_row + h_per_step - 1) / h_per_step); + const int threadsPerGroup = (one_step_threads < maxThreads) ? one_step_threads : maxThreads; + + const int groups_per_block_max = + is_subblock_schedule ? (maxThreads + threadsPerGroup - 1) / threadsPerGroup : 1; + const int groups_per_block = (rows < groups_per_block_max) ? rows : groups_per_block_max; + const int groups_launch = (groups_per_block + rows - 1) / groups_per_block; + + dim3 block(threadsPerGroup, groups_per_block); + dim3 grid(groups_launch); + + const int elems_per_step = threadsPerGroup * h_per_step; + const int external_unRoll = (elems_per_row + elems_per_step - 1) / elems_per_step; + + if (is_subblock_schedule) { + // <=128 + if (threadsPerGroup == 1) { + LAUNCH_FUSED_LN(1, 1, maxThreads); + } else if (threadsPerGroup == 2) { + LAUNCH_FUSED_LN(1, 2, maxThreads); + } else if (threadsPerGroup == 4) { + LAUNCH_FUSED_LN(1, 4, maxThreads); + } else if (threadsPerGroup == 8) { + LAUNCH_FUSED_LN(1, 8, maxThreads); + } else if (threadsPerGroup == 16) { + LAUNCH_FUSED_LN(1, 16, maxThreads); + } + } else if (external_unRoll == 1) { + // 129 - 4096 elems + // (this can launch with 1-7 warps as well) + LAUNCH_FUSED_LN(1 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 2) { + // 4097 - 8192 elems + LAUNCH_FUSED_LN(2 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 3) { + // 8193 - 12288 elems + LAUNCH_FUSED_LN(3 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 4) { + // 12289 - 16384 elems + LAUNCH_FUSED_LN(4 * internal_unRoll, maxThreads, maxThreads); + } +} + +template void launch_fused_ln(__half*, + const __half*, + const __half*, + const __half*, + float, + int, + int, + cudaStream_t); 
+template void +launch_fused_ln(float*, const float*, const float*, const float*, float, int, int, cudaStream_t); + +/* +Fused resiual + bias + layer norm implementation. Assumes elems_per_row % 8 +is equal to 0. + +TODO(cmikeh2): Goal is to deprecate this implementation. The bias + residual +need to be fused into compute-bound producer operations. + +Args: + output: buffer for output data + res_output: output of residual addition + vals: buffer for input data + residual: residual data + bias: bias of of input data + gamma: gain for normalization + beta: bias for normalization + epsilon: numeric stability + elems_per_row: number of elements each block will normalize +Template arg: + StoreResidual: controls whether the residual calculation is stored + or not. When set to false, the input `res_output` is unused. +*/ +template +__global__ void fused_residual_ln(T* output, + T* res_output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int elems_per_row) +{ + constexpr int T_per_load = ln::granularity / sizeof(T); + + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + // X-dimension of the block + const int block_offset = (tb.group_index().x * (maxThreads / threadsPerGroup) * elems_per_row) + + (tb.thread_index().y * elems_per_row); + const int thread_offset = tb.thread_index().x * T_per_load; + const int base_offset = block_offset + thread_offset; + const int stride = tb.size() * T_per_load; + + float sum = reduce::init(); + + const T* input_base = vals + base_offset; + const T* residual_base = residual + base_offset; + const T* bias_base = bias + thread_offset; + + T local_buffer[unRoll * T_per_load]; + + // Unlike a vanilla layernorm, since we're fusing the two adds as well + // an inner unRoll seems to be less valuable. If anything, a double unRoll + // makes the most sense if we find we are having performance issues. 
+#pragma unRoll + for (int i = 0; i < unRoll; i++) { + T* iteration_buffer = local_buffer + i * T_per_load; + T residual_buffer[T_per_load]; + T bias_buffer[T_per_load]; + + mem_access::load_global( + iteration_buffer, input_base + i * stride, thread_offset + i * stride < elems_per_row); + mem_access::load_global(residual_buffer, + residual_base + i * stride, + thread_offset + i * stride < elems_per_row); + mem_access::load_global( + bias_buffer, bias_base + i * stride, thread_offset + i * stride < elems_per_row); + +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + float vals_up_cast = conversion::to(iteration_buffer[j]); + float res_up_cast = conversion::to(residual_buffer[j]); + float bias_up_cast = conversion::to(bias_buffer[j]); + vals_up_cast += res_up_cast + bias_up_cast; + sum = reduce::element(sum, vals_up_cast); + iteration_buffer[j] = conversion::to(vals_up_cast); + } + + if (preLnResidual && (thread_offset + i * stride < elems_per_row)) { + mem_access::store_global(res_output + base_offset + i * stride, + iteration_buffer); + } + } + + reduce::partitioned_block(tb, warp, sum); + const float mean = sum / elems_per_row; + + float mean_diff = reduce::init(); +#pragma unRoll + for (int i = 0; i < unRoll; i++) { +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + // Using a 0 value here skews the variance, have to if-guard + if (thread_offset + i * stride < elems_per_row) { + float diff = (conversion::to(local_buffer[i * T_per_load + j]) - mean); + mean_diff = reduce::element(mean_diff, diff * diff); + } + } + } + + reduce::partitioned_block(tb, warp, mean_diff); + const float variance = mean_diff / elems_per_row; + const float denom = __frsqrt_rn(variance + epsilon); + + const T mean_compute = conversion::to(mean); + const T denom_compute = conversion::to(denom); + + T* block_output = output + block_offset; + +#pragma unRoll + for (int i = 0; i < unRoll; i++) { + T* iteration_buffer = local_buffer + i * T_per_load; + const int iter_idx = i * 
stride + thread_offset; + const bool do_loads = iter_idx < elems_per_row; + + T gamma_local[T_per_load], beta_local[T_per_load]; + + mem_access::load_global(gamma_local, gamma + iter_idx, do_loads); + mem_access::load_global(beta_local, beta + iter_idx, do_loads); + +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + iteration_buffer[j] = (iteration_buffer[j] - mean_compute) * denom_compute; + iteration_buffer[j] = iteration_buffer[j] * gamma_local[j] + beta_local[j]; + } + + if (do_loads) { + mem_access::store_global(block_output + iter_idx, iteration_buffer); + } + } +} + +// TODO(cmikeh2): There's a bunch of redundancy here that needs to be removed/simplified. +#define LAUNCH_FUSED_RES_LN(unRollFactor, threadsPerGroup, maxThreads) \ + fused_residual_ln \ + <<>>( \ + output, nullptr, vals, residual, bias, gamma, beta, epsilon, elems_per_row); + +template +void launch_fused_residual_ln(T* output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + cudaStream_t stream) +{ + // 8 for __half, 4 for float + constexpr int T_per_load = ln::granularity / sizeof(T); + + constexpr int maxThreads = 256; + + // For Flaoat, unRoll 4, for __half, unRoll 2 + constexpr int internal_unRoll = sizeof(T) == 4 ? 4 : 2; + + const bool is_subblock_schedule = (elems_per_row <= 128) ? true : false; + const int h_per_step = is_subblock_schedule ? T_per_load : T_per_load * internal_unRoll; + + // Scheduling concern: may be slightly faster for some inputs to assign multiple stages of + // warp-sized blocks rather than stepping up to 64/96 threads + const int one_step_threads = next_pow2((elems_per_row + h_per_step - 1) / h_per_step); + const int threadsPerGroup = (one_step_threads < maxThreads) ? one_step_threads : maxThreads; + + const int groups_per_block_max = + is_subblock_schedule ? 
(maxThreads + threadsPerGroup - 1) / threadsPerGroup : 1; + const int groups_per_block = (rows < groups_per_block_max) ? rows : groups_per_block_max; + const int groups_launch = (groups_per_block + rows - 1) / groups_per_block; + + dim3 block(threadsPerGroup, groups_per_block); + dim3 grid(groups_launch); + + const int elems_per_step = threadsPerGroup * h_per_step; + const int external_unRoll = (elems_per_row + elems_per_step - 1) / elems_per_step; + + if (is_subblock_schedule) { + // <=128 + if (threadsPerGroup == 1) { + LAUNCH_FUSED_RES_LN(1, 1, maxThreads); + } else if (threadsPerGroup == 2) { + LAUNCH_FUSED_RES_LN(1, 2, maxThreads); + } else if (threadsPerGroup == 4) { + LAUNCH_FUSED_RES_LN(1, 4, maxThreads); + } else if (threadsPerGroup == 8) { + LAUNCH_FUSED_RES_LN(1, 8, maxThreads); + } else if (threadsPerGroup == 16) { + LAUNCH_FUSED_RES_LN(1, 16, maxThreads); + } + } else if (external_unRoll == 1) { + // 129 - 4096 elems + // (this can launch with 1-7 warps as well) + LAUNCH_FUSED_RES_LN(1 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 2) { + // 4097 - 8192 elems + LAUNCH_FUSED_RES_LN(2 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 3) { + // 8193 - 12288 elems + LAUNCH_FUSED_RES_LN(3 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 4) { + // 12289 - 16384 elems + LAUNCH_FUSED_RES_LN(4 * internal_unRoll, maxThreads, maxThreads); + } +} + +#define LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(unRollFactor, threadsPerGroup, maxThreads) \ + fused_residual_ln \ + <<>>( \ + norm_output, res_output, vals, residual, bias, gamma, beta, epsilon, elems_per_row); + +template +void launch_fused_residual_ln_store_pre_ln_res(T* norm_output, + T* res_output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + cudaStream_t stream) +{ + // 8 for __half, 4 for float + constexpr int T_per_load = ln::granularity 
/ sizeof(T); + + constexpr int maxThreads = 256; + + // For Flaoat, unRoll 4, for __half, unRoll 2 + constexpr int internal_unRoll = sizeof(T) == 4 ? 4 : 2; + + const bool is_subblock_schedule = (elems_per_row <= 128) ? true : false; + const int h_per_step = is_subblock_schedule ? T_per_load : T_per_load * internal_unRoll; + + // Scheduling concern: may be slightly faster for some inputs to assign multiple stages of + // warp-sized blocks rather than stepping up to 64/96 threads + const int one_step_threads = next_pow2((elems_per_row + h_per_step - 1) / h_per_step); + const int threadsPerGroup = (one_step_threads < maxThreads) ? one_step_threads : maxThreads; + + const int groups_per_block_max = + is_subblock_schedule ? (maxThreads + threadsPerGroup - 1) / threadsPerGroup : 1; + const int groups_per_block = (rows < groups_per_block_max) ? rows : groups_per_block_max; + const int groups_launch = (groups_per_block + rows - 1) / groups_per_block; + + dim3 block(threadsPerGroup, groups_per_block); + dim3 grid(groups_launch); + + const int elems_per_step = threadsPerGroup * h_per_step; + const int external_unRoll = (elems_per_row + elems_per_step - 1) / elems_per_step; + + if (is_subblock_schedule) { + // <=128 + if (threadsPerGroup == 1) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 1, maxThreads); + } else if (threadsPerGroup == 2) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 2, maxThreads); + } else if (threadsPerGroup == 4) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 4, maxThreads); + } else if (threadsPerGroup == 8) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 8, maxThreads); + } else if (threadsPerGroup == 16) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 16, maxThreads); + } + } else if (external_unRoll == 1) { + // 129 - 4096 elems + // (this can launch with 1-7 warps as well) + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 2) { + // 4097 - 8192 elems + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(2 * 
internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 3) { + // 8193 - 12288 elems + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(3 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 4) { + // 12289 - 16384 elems + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(4 * internal_unRoll, maxThreads, maxThreads); + } +} + +// No-store specializations +template void launch_fused_residual_ln(__half*, + const __half*, + const __half*, + const __half*, + const __half*, + const __half*, + float, + int, + int, + cudaStream_t); + +template void launch_fused_residual_ln(float*, + const float*, + const float*, + const float*, + const float*, + const float*, + float, + int, + int, + cudaStream_t); + +// Store specializations +template void launch_fused_residual_ln_store_pre_ln_res(__half*, + __half*, + const __half*, + const __half*, + const __half*, + const __half*, + const __half*, + float, + int, + int, + cudaStream_t); + +template void launch_fused_residual_ln_store_pre_ln_res(float*, + float*, + const float*, + const float*, + const float*, + const float*, + const float*, + float, + int, + int, + cudaStream_t); diff --git a/csrc/transformer/inference/csrc/layer_norm.hip b/csrc/transformer/inference/csrc/layer_norm.hip new file mode 100644 index 0000000000000000000000000000000000000000..785521771da1f65cc3948be18edecde0c5feb382 --- /dev/null +++ b/csrc/transformer/inference/csrc/layer_norm.hip @@ -0,0 +1,531 @@ +// !!! This is a file automatically generated by hipify!!! +#include "hip/hip_runtime.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "conversion_utils.h" +#include "ds_kernel_utils.h" +#include "inference_cuda_layers.h" +#include "memory_access_utils.h" +#include "reduction_utils.h" + +namespace cg = cooperative_groups; +using rop = reduce::ROpType; + +namespace ln { +constexpr int granularity = 16; +} // namespace ln + +/* +Primary layer norm implementation. Assumes elems_per_row % 8 +is equal to 0. 
+ +Args: + output: buffer for output data + vals: buffer for input data + gamma: gain for normalization + beta: bias for normalization + epsilon: numeric stability + elems_per_row: number of elements each block will normalize +*/ +template +__global__ void fused_ln(T* output, + const T* vals, + const T* gamma, + const T* beta, + float epsilon, + int elems_per_row) +{ + constexpr int T_per_load = ln::granularity / sizeof(T); + + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + // X-dimension of the block + const int block_offset = (tb.group_index().x * (maxThreads / threadsPerGroup) * elems_per_row) + + (tb.thread_index().y * elems_per_row); + const int thread_offset = tb.thread_index().x * T_per_load; + const int base_offset = block_offset + thread_offset; + const int stride = tb.size() * T_per_load; + + float sum = reduce::init(); + + const T* input_base = vals + base_offset; + + T local_buffer[unRoll * T_per_load]; + +#pragma unRoll + for (int i = 0; i < unRoll; i++) { + T* iteration_buffer = local_buffer + i * T_per_load; + T residual_buffer[T_per_load]; + T bias_buffer[T_per_load]; + + mem_access::load_global( + iteration_buffer, input_base + i * stride, thread_offset + i * stride < elems_per_row); + +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + float vals_up_cast = conversion::to(iteration_buffer[j]); + sum = reduce::element(sum, vals_up_cast); + } + } + + reduce::partitioned_block(tb, warp, sum); + const float mean = sum / elems_per_row; + + float mean_diff = reduce::init(); + +#pragma unRoll + for (int i = 0; i < unRoll; i++) { +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + // Using a 0 value here skews the variance, have to if-guard + if (thread_offset + i * stride < elems_per_row) { + float diff = (conversion::to(local_buffer[i * T_per_load + j]) - mean); + mean_diff = reduce::element(mean_diff, diff * diff); + } + } + } + + reduce::partitioned_block(tb, warp, mean_diff); + 
const float variance = mean_diff / elems_per_row; + const float denom = __frsqrt_rn(variance + epsilon); + + const T mean_compute = conversion::to(mean); + const T denom_compute = conversion::to(denom); + + T* block_output = output + block_offset; + +#pragma unRoll + for (int i = 0; i < unRoll; i++) { + T* iteration_buffer = local_buffer + i * T_per_load; + const int iter_idx = i * stride + thread_offset; + const bool do_loads = iter_idx < elems_per_row; + + T gamma_local[T_per_load], beta_local[T_per_load]; + + mem_access::load_global(gamma_local, gamma + iter_idx, do_loads); + mem_access::load_global(beta_local, beta + iter_idx, do_loads); + +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + iteration_buffer[j] = (iteration_buffer[j] - mean_compute) * denom_compute; + iteration_buffer[j] = iteration_buffer[j] * gamma_local[j] + beta_local[j]; + } + + if (do_loads) { + mem_access::store_global(block_output + iter_idx, iteration_buffer); + } + } +} + +#define LAUNCH_FUSED_LN(unRollFactor, threadsPerGroup, maxThreads) \ + hipLaunchKernelGGL(( fused_ln) \ + , dim3(grid), dim3(block), 0, stream, output, vals, gamma, beta, epsilon, elems_per_row); + +template +void launch_fused_ln(T* output, + const T* vals, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + hipStream_t stream) +{ + // 8 for __half, 4 for float + constexpr int T_per_load = ln::granularity / sizeof(T); + + constexpr int maxThreads = 256; + + // For Flaoat, unRoll 4, for __half, unRoll 2 + constexpr int internal_unRoll = sizeof(T) == 4 ? 4 : 2; + + const bool is_subblock_schedule = (elems_per_row <= 128) ? true : false; + const int h_per_step = is_subblock_schedule ? 
T_per_load : T_per_load * internal_unRoll; + + // Scheduling concern: may be slightly faster for some inputs to assign multiple stages of + // warp-sized blocks rather than stepping up to 64/96 threads + const int one_step_threads = next_pow2((elems_per_row + h_per_step - 1) / h_per_step); + const int threadsPerGroup = (one_step_threads < maxThreads) ? one_step_threads : maxThreads; + + const int groups_per_block_max = + is_subblock_schedule ? (maxThreads + threadsPerGroup - 1) / threadsPerGroup : 1; + const int groups_per_block = (rows < groups_per_block_max) ? rows : groups_per_block_max; + const int groups_launch = (groups_per_block + rows - 1) / groups_per_block; + + dim3 block(threadsPerGroup, groups_per_block); + dim3 grid(groups_launch); + + const int elems_per_step = threadsPerGroup * h_per_step; + const int external_unRoll = (elems_per_row + elems_per_step - 1) / elems_per_step; + + if (is_subblock_schedule) { + // <=128 + if (threadsPerGroup == 1) { + LAUNCH_FUSED_LN(1, 1, maxThreads); + } else if (threadsPerGroup == 2) { + LAUNCH_FUSED_LN(1, 2, maxThreads); + } else if (threadsPerGroup == 4) { + LAUNCH_FUSED_LN(1, 4, maxThreads); + } else if (threadsPerGroup == 8) { + LAUNCH_FUSED_LN(1, 8, maxThreads); + } else if (threadsPerGroup == 16) { + LAUNCH_FUSED_LN(1, 16, maxThreads); + } + } else if (external_unRoll == 1) { + // 129 - 4096 elems + // (this can launch with 1-7 warps as well) + LAUNCH_FUSED_LN(1 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 2) { + // 4097 - 8192 elems + LAUNCH_FUSED_LN(2 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 3) { + // 8193 - 12288 elems + LAUNCH_FUSED_LN(3 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 4) { + // 12289 - 16384 elems + LAUNCH_FUSED_LN(4 * internal_unRoll, maxThreads, maxThreads); + } +} + +template void launch_fused_ln(__half*, + const __half*, + const __half*, + const __half*, + float, + int, + int, + hipStream_t); 
+template void +launch_fused_ln(float*, const float*, const float*, const float*, float, int, int, hipStream_t); + +/* +Fused resiual + bias + layer norm implementation. Assumes elems_per_row % 8 +is equal to 0. + +TODO(cmikeh2): Goal is to deprecate this implementation. The bias + residual +need to be fused into compute-bound producer operations. + +Args: + output: buffer for output data + res_output: output of residual addition + vals: buffer for input data + residual: residual data + bias: bias of of input data + gamma: gain for normalization + beta: bias for normalization + epsilon: numeric stability + elems_per_row: number of elements each block will normalize +Template arg: + StoreResidual: controls whether the residual calculation is stored + or not. When set to false, the input `res_output` is unused. +*/ +template +__global__ void fused_residual_ln(T* output, + T* res_output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int elems_per_row) +{ + constexpr int T_per_load = ln::granularity / sizeof(T); + + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + // X-dimension of the block + const int block_offset = (tb.group_index().x * (maxThreads / threadsPerGroup) * elems_per_row) + + (tb.thread_index().y * elems_per_row); + const int thread_offset = tb.thread_index().x * T_per_load; + const int base_offset = block_offset + thread_offset; + const int stride = tb.size() * T_per_load; + + float sum = reduce::init(); + + const T* input_base = vals + base_offset; + const T* residual_base = residual + base_offset; + const T* bias_base = bias + thread_offset; + + T local_buffer[unRoll * T_per_load]; + + // Unlike a vanilla layernorm, since we're fusing the two adds as well + // an inner unRoll seems to be less valuable. If anything, a double unRoll + // makes the most sense if we find we are having performance issues. 
+#pragma unRoll + for (int i = 0; i < unRoll; i++) { + T* iteration_buffer = local_buffer + i * T_per_load; + T residual_buffer[T_per_load]; + T bias_buffer[T_per_load]; + + mem_access::load_global( + iteration_buffer, input_base + i * stride, thread_offset + i * stride < elems_per_row); + mem_access::load_global(residual_buffer, + residual_base + i * stride, + thread_offset + i * stride < elems_per_row); + mem_access::load_global( + bias_buffer, bias_base + i * stride, thread_offset + i * stride < elems_per_row); + +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + float vals_up_cast = conversion::to(iteration_buffer[j]); + float res_up_cast = conversion::to(residual_buffer[j]); + float bias_up_cast = conversion::to(bias_buffer[j]); + vals_up_cast += res_up_cast + bias_up_cast; + sum = reduce::element(sum, vals_up_cast); + iteration_buffer[j] = conversion::to(vals_up_cast); + } + + if (preLnResidual && (thread_offset + i * stride < elems_per_row)) { + mem_access::store_global(res_output + base_offset + i * stride, + iteration_buffer); + } + } + + reduce::partitioned_block(tb, warp, sum); + const float mean = sum / elems_per_row; + + float mean_diff = reduce::init(); +#pragma unRoll + for (int i = 0; i < unRoll; i++) { +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + // Using a 0 value here skews the variance, have to if-guard + if (thread_offset + i * stride < elems_per_row) { + float diff = (conversion::to(local_buffer[i * T_per_load + j]) - mean); + mean_diff = reduce::element(mean_diff, diff * diff); + } + } + } + + reduce::partitioned_block(tb, warp, mean_diff); + const float variance = mean_diff / elems_per_row; + const float denom = __frsqrt_rn(variance + epsilon); + + const T mean_compute = conversion::to(mean); + const T denom_compute = conversion::to(denom); + + T* block_output = output + block_offset; + +#pragma unRoll + for (int i = 0; i < unRoll; i++) { + T* iteration_buffer = local_buffer + i * T_per_load; + const int iter_idx = i * 
stride + thread_offset; + const bool do_loads = iter_idx < elems_per_row; + + T gamma_local[T_per_load], beta_local[T_per_load]; + + mem_access::load_global(gamma_local, gamma + iter_idx, do_loads); + mem_access::load_global(beta_local, beta + iter_idx, do_loads); + +#pragma unRoll + for (int j = 0; j < T_per_load; j++) { + iteration_buffer[j] = (iteration_buffer[j] - mean_compute) * denom_compute; + iteration_buffer[j] = iteration_buffer[j] * gamma_local[j] + beta_local[j]; + } + + if (do_loads) { + mem_access::store_global(block_output + iter_idx, iteration_buffer); + } + } +} + +// TODO(cmikeh2): There's a bunch of redundancy here that needs to be removed/simplified. +#define LAUNCH_FUSED_RES_LN(unRollFactor, threadsPerGroup, maxThreads) \ + hipLaunchKernelGGL(( fused_residual_ln) \ + , dim3(grid), dim3(block), 0, stream, \ + output, nullptr, vals, residual, bias, gamma, beta, epsilon, elems_per_row); + +template +void launch_fused_residual_ln(T* output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + hipStream_t stream) +{ + // 8 for __half, 4 for float + constexpr int T_per_load = ln::granularity / sizeof(T); + + constexpr int maxThreads = 256; + + // For Flaoat, unRoll 4, for __half, unRoll 2 + constexpr int internal_unRoll = sizeof(T) == 4 ? 4 : 2; + + const bool is_subblock_schedule = (elems_per_row <= 128) ? true : false; + const int h_per_step = is_subblock_schedule ? T_per_load : T_per_load * internal_unRoll; + + // Scheduling concern: may be slightly faster for some inputs to assign multiple stages of + // warp-sized blocks rather than stepping up to 64/96 threads + const int one_step_threads = next_pow2((elems_per_row + h_per_step - 1) / h_per_step); + const int threadsPerGroup = (one_step_threads < maxThreads) ? one_step_threads : maxThreads; + + const int groups_per_block_max = + is_subblock_schedule ? 
(maxThreads + threadsPerGroup - 1) / threadsPerGroup : 1; + const int groups_per_block = (rows < groups_per_block_max) ? rows : groups_per_block_max; + const int groups_launch = (groups_per_block + rows - 1) / groups_per_block; + + dim3 block(threadsPerGroup, groups_per_block); + dim3 grid(groups_launch); + + const int elems_per_step = threadsPerGroup * h_per_step; + const int external_unRoll = (elems_per_row + elems_per_step - 1) / elems_per_step; + + if (is_subblock_schedule) { + // <=128 + if (threadsPerGroup == 1) { + LAUNCH_FUSED_RES_LN(1, 1, maxThreads); + } else if (threadsPerGroup == 2) { + LAUNCH_FUSED_RES_LN(1, 2, maxThreads); + } else if (threadsPerGroup == 4) { + LAUNCH_FUSED_RES_LN(1, 4, maxThreads); + } else if (threadsPerGroup == 8) { + LAUNCH_FUSED_RES_LN(1, 8, maxThreads); + } else if (threadsPerGroup == 16) { + LAUNCH_FUSED_RES_LN(1, 16, maxThreads); + } + } else if (external_unRoll == 1) { + // 129 - 4096 elems + // (this can launch with 1-7 warps as well) + LAUNCH_FUSED_RES_LN(1 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 2) { + // 4097 - 8192 elems + LAUNCH_FUSED_RES_LN(2 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 3) { + // 8193 - 12288 elems + LAUNCH_FUSED_RES_LN(3 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 4) { + // 12289 - 16384 elems + LAUNCH_FUSED_RES_LN(4 * internal_unRoll, maxThreads, maxThreads); + } +} + +#define LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(unRollFactor, threadsPerGroup, maxThreads) \ + hipLaunchKernelGGL(( fused_residual_ln) \ + , dim3(grid), dim3(block), 0, stream, \ + norm_output, res_output, vals, residual, bias, gamma, beta, epsilon, elems_per_row); + +template +void launch_fused_residual_ln_store_pre_ln_res(T* norm_output, + T* res_output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + hipStream_t stream) +{ + // 8 for __half, 4 
for float + constexpr int T_per_load = ln::granularity / sizeof(T); + + constexpr int maxThreads = 256; + + // For Flaoat, unRoll 4, for __half, unRoll 2 + constexpr int internal_unRoll = sizeof(T) == 4 ? 4 : 2; + + const bool is_subblock_schedule = (elems_per_row <= 128) ? true : false; + const int h_per_step = is_subblock_schedule ? T_per_load : T_per_load * internal_unRoll; + + // Scheduling concern: may be slightly faster for some inputs to assign multiple stages of + // warp-sized blocks rather than stepping up to 64/96 threads + const int one_step_threads = next_pow2((elems_per_row + h_per_step - 1) / h_per_step); + const int threadsPerGroup = (one_step_threads < maxThreads) ? one_step_threads : maxThreads; + + const int groups_per_block_max = + is_subblock_schedule ? (maxThreads + threadsPerGroup - 1) / threadsPerGroup : 1; + const int groups_per_block = (rows < groups_per_block_max) ? rows : groups_per_block_max; + const int groups_launch = (groups_per_block + rows - 1) / groups_per_block; + + dim3 block(threadsPerGroup, groups_per_block); + dim3 grid(groups_launch); + + const int elems_per_step = threadsPerGroup * h_per_step; + const int external_unRoll = (elems_per_row + elems_per_step - 1) / elems_per_step; + + if (is_subblock_schedule) { + // <=128 + if (threadsPerGroup == 1) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 1, maxThreads); + } else if (threadsPerGroup == 2) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 2, maxThreads); + } else if (threadsPerGroup == 4) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 4, maxThreads); + } else if (threadsPerGroup == 8) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 8, maxThreads); + } else if (threadsPerGroup == 16) { + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1, 16, maxThreads); + } + } else if (external_unRoll == 1) { + // 129 - 4096 elems + // (this can launch with 1-7 warps as well) + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(1 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 2) { + // 4097 - 8192 
elems + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(2 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 3) { + // 8193 - 12288 elems + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(3 * internal_unRoll, maxThreads, maxThreads); + } else if (external_unRoll == 4) { + // 12289 - 16384 elems + LAUNCH_FUSED_RES_LN_STORE_PRE_LN_RES(4 * internal_unRoll, maxThreads, maxThreads); + } +} + +// No-store specializations +template void launch_fused_residual_ln(__half*, + const __half*, + const __half*, + const __half*, + const __half*, + const __half*, + float, + int, + int, + hipStream_t); + +template void launch_fused_residual_ln(float*, + const float*, + const float*, + const float*, + const float*, + const float*, + float, + int, + int, + hipStream_t); + +// Store specializations +template void launch_fused_residual_ln_store_pre_ln_res(__half*, + __half*, + const __half*, + const __half*, + const __half*, + const __half*, + const __half*, + float, + int, + int, + hipStream_t); + +template void launch_fused_residual_ln_store_pre_ln_res(float*, + float*, + const float*, + const float*, + const float*, + const float*, + const float*, + float, + int, + int, + hipStream_t); diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 320e6491b1cd1cb87749e8c8cb8624871b1cc904..1d6857042ca158821e67267517a05dac35f3ef1c 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -1,39 +1,89 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ -#include +#include #include +#include #include -#include "context.h" -#include "cublas_wrappers.h" -#include "custom_cuda_layers.h" +#include "inference_context.h" +#include "inference_cublas_wrappers.h" +#include "inference_cuda_layers.h" std::array gemm_algos = std::array({99, 99, 99}); -#define MAX_OUT_TOKES 10 +// NOTE: This activation function type enum should be always in sync +// with the python counterpart, 
otherwise the casting from python binding +// will be incorrect. +enum class ActivationFuncType { UNKNOWN = 0, GELU = 1, ReLU = 2 }; + +enum class TransformerType : uint8_t { UNKNOWN = 0, GPTType = 1, BERTType = 2 }; + +// NOTE: this is a temporary and dodgy solution to distinguish GPT and BERT style models +// based on the dimensions of the corresponding attention mask. +inline auto infer_transformer_type(at::Tensor& attn_mask) -> TransformerType +{ + auto attn_mask_num_dims = attn_mask.sizes().size(); + + if (attn_mask_num_dims > 2) { + return TransformerType::GPTType; + } else if (attn_mask_num_dims == 2) { + return TransformerType::BERTType; + } else { + return TransformerType::UNKNOWN; + } +} + +// infer stride of attention mask memory layout based on the model type. +inline auto get_attn_mask_stride(at::Tensor& attn_mask) -> int +{ + auto trnsfrmr_type = infer_transformer_type(attn_mask); + + if (trnsfrmr_type == TransformerType::GPTType) { + return attn_mask.size(2); + } else if (trnsfrmr_type == TransformerType::BERTType) { + // Bert style models have always a mask stride of 1. + return 1; + } else if (trnsfrmr_type == TransformerType::UNKNOWN) { + return 0; + } + + // this is just to make the compiler happy. 
+ return 0; +} template at::Tensor ds_softmax(at::Tensor& attn_scores, at::Tensor& attn_mask, + at::Tensor& alibi, bool triangular, bool recompute, bool local_attention, int window_size, - bool async_op) + bool async_op, + float layer_scale, + int head_offset, + int mp_size) { auto attn_scores_c = attn_scores.contiguous(); int bsz = attn_scores_c.size(0); int seq_len = attn_scores_c.size(1); int len = attn_scores_c.sizes().size(); - if (len > 3) seq_len = attn_scores_c.size(2); + if (len > 2) seq_len = attn_scores_c.size(2); int soft_len = attn_scores_c.size(2); if (len > 3) soft_len = attn_scores_c.size(3); int heads = 1; - if (len > 3) heads = attn_scores_c.size(1); + if (len > 1) heads = attn_scores_c.size(1); + + auto mask_stride = get_attn_mask_stride(attn_mask); launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), + (alibi.sizes().size() > 1 ? (T*)alibi.data_ptr() : nullptr), + layer_scale, triangular, recompute, local_attention, @@ -42,20 +92,35 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, heads, seq_len, soft_len, - 1.0, + head_offset, + mask_stride, + mp_size, Context::Instance().GetCurrentStream(async_op)); return attn_scores_c; } template -void allocate_workspace(size_t hidden_dim, - size_t max_seq_len, - size_t batch_size, - size_t head_size = 128) +void allocate_workspace(unsigned hidden_dim, + unsigned num_heads, + unsigned prompt_length, + unsigned batch_size, + unsigned num_layers, + unsigned mp_size = 1, + bool external_cache = false, + unsigned rank = 0, + unsigned max_out_tokens = 1024) { - size_t _workSpaceSize = (hidden_dim * batch_size * max_seq_len); - Context::Instance().GenWorkSpace(_workSpaceSize * sizeof(T)); + Context::Instance().GenWorkSpace(num_layers, + num_heads, + batch_size, + prompt_length, + hidden_dim, + mp_size, + external_cache, + sizeof(T), + rank, + max_out_tokens); } template @@ -70,10 +135,13 @@ at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& 
W) float alpha = 1; float gemm_beta = 0.0; - if (!workspace) { - allocate_workspace(W.size(1), MAX_OUT_TOKES, Q.size(0)); - workspace = (T*)Context::Instance().GetWorkSpace(); + /* + // Reallocate memory if we received a new prompt + if (!workspace || input.size(1) != 1) { + allocate_workspace(W.size(1), Context::Instance().GetMaxTokenLenght(), Q.size(0), 1, + head_size); workspace = (T*)Context::Instance().GetWorkSpace(); } + */ auto O = at::from_blob(workspace, {Q.size(1), Q.size(2), W.size(1)}, options); unsigned m = W.size(1); @@ -123,6 +191,9 @@ void attention_unfused(at::Tensor& prev_key_cont, float gemm_beta = 0.0; auto attn_score = at::empty({bsz, heads, seq_len, soft_len}, options); int k = prev_value_cont.size(2) / heads; + + auto mask_stride = get_attn_mask_stride(attn_mask); + cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), soft_len, @@ -144,8 +215,22 @@ void attention_unfused(at::Tensor& prev_key_cont, #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif - attn_score = ds_softmax( - attn_score, attn_mask, triangular, recompute, local_attention, window_size, false); + launch_attn_softmax_v2((T*)attn_score.data_ptr(), + (T*)(attn_mask.sizes().size() > 1 ? 
attn_mask.data_ptr() : nullptr), + (T*)nullptr, + 1.0, + triangular, + recompute, + local_attention, + window_size, + bsz, + heads, + seq_len, + soft_len, + 0, + mask_stride, + 1, + Context::Instance().GetCurrentStream(false)); alpha = 1.0; cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), k, @@ -170,19 +255,19 @@ void attention_unfused(at::Tensor& prev_key_cont, } template -std::vector ds_softmax_context(at::Tensor& query, - at::Tensor& prev_key, - at::Tensor& new_key, - at::Tensor& attn_mask, - at::Tensor& prev_value, - at::Tensor& new_value, - int heads, - float norm_factor, - bool merging, - bool triangular, - bool local_attention, - int window_size, - bool no_masking) +std::vector ds_softmax_context1(at::Tensor& query, + at::Tensor& prev_key, + at::Tensor& new_key, + at::Tensor& attn_mask, + at::Tensor& prev_value, + at::Tensor& new_value, + int heads, + float norm_factor, + bool merging, + bool triangular, + bool local_attention, + int window_size, + bool no_masking) { auto query_cont = query.contiguous(); auto prev_key_cont = prev_key.contiguous(); @@ -222,6 +307,230 @@ std::vector ds_softmax_context(at::Tensor& query, return {output, prev_key, prev_value}; } +template +void ds_softmax_internal(T* attn_scores, + at::Tensor& attn_mask, + at::Tensor& alibi, + float& layer_scale, + bool triangular, + bool recompute, + bool local_attention, + int window_size, + int bsz, + int seq_len, + int soft_len, + int heads) +{ + auto mask_stride = get_attn_mask_stride(attn_mask); + + launch_attn_softmax_v2((T*)attn_scores, + (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), + (alibi.sizes().size() > 1 ? 
(T*)alibi.data_ptr() : nullptr), + layer_scale, + triangular, + recompute, + local_attention, + window_size, + bsz, + heads, + seq_len, + soft_len, + 0, + mask_stride, + 1, + at::cuda::getCurrentCUDAStream()); +} + +template +void attention_unfused(T* prev_key_cont, + T* query_cont, + at::Tensor& attn_mask, + T* prev_value_cont, + T* output, + unsigned& bsz, + int& k, + unsigned& seq_len, + unsigned& soft_len, + int& heads, + float& norm_factor, + bool triangular, + bool recompute, + bool local_attention, + int window_size, + at::Tensor& alibi, + int layer_id) +{ + float layer_scale = alibi.sizes().size() > 1 ? std::max(1, layer_id) : 1.0; + float alpha = norm_factor * norm_factor / layer_scale; + float gemm_beta = 0.0; + T* workspace = (T*)Context::Instance().GetAttentionUnfusedWorkspace(); + + cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); + cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), + soft_len, + seq_len, + k, + &alpha, + &gemm_beta, + (T*)prev_key_cont, + (T*)query_cont, + workspace, + CUBLAS_OP_T, + CUBLAS_OP_N, + Context::Instance().GetMaxTokenLenght() * k, + seq_len * k, + seq_len * soft_len, + bsz * heads, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo_standard); +#else + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif + ds_softmax_internal(workspace, + attn_mask, + alibi, + layer_scale, + triangular, + recompute, + local_attention, + window_size, + bsz, + seq_len, + soft_len, + heads); + alpha = 1.0; + cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), + k, + seq_len, + soft_len, + &alpha, + &gemm_beta, + (T*)prev_value_cont, + workspace, + (T*)output, + CUBLAS_OP_N, + CUBLAS_OP_N, + Context::Instance().GetMaxTokenLenght() * k, + seq_len * soft_len, + seq_len * k, + bsz * heads, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo_standard); +#else + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif +} + +void reset_cache() { Context::Instance().reset_tokens(); } + +template +std::vector 
ds_softmax_context(at::Tensor& query_key_value, + at::Tensor& attn_mask, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + int heads, + float norm_factor, + bool triangular, + bool local_attention, + int window_size, + bool no_masking, + unsigned layer_id, + unsigned num_layers, + at::Tensor& alibi) +{ + unsigned bsz = query_key_value.size(0); + unsigned seq_len = query_key_value.size(1); + unsigned hidden_dim = query_key_value.size(2) / 3; + + bool is_prompt = (seq_len > 1); + + if (is_prompt) Context::Instance().reset_tokens(seq_len); + unsigned soft_len = Context::Instance().current_tokens(); + + int k = hidden_dim / heads; + auto options = at::TensorOptions() + .dtype(query_key_value.options().dtype()) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + T* workspace = (T*)Context::Instance().GetWorkSpace(); + size_t buf_size = bsz * seq_len * hidden_dim; + auto output = torch::from_blob(workspace + 4 * buf_size, {bsz, seq_len, hidden_dim}, options); + + auto query_cont = workspace + 8 * buf_size; + size_t offset = 16 * (hidden_dim * bsz * Context::Instance().GetMaxTokenLenght()) + + layer_id * 2 * bsz * Context::Instance().GetMaxTokenLenght() * hidden_dim; + unsigned all_tokens = soft_len; + auto kv_cache = workspace + offset + (hidden_dim / heads) * (is_prompt ? 0 : soft_len - 1); + size_t value_offset = bsz * Context::Instance().GetMaxTokenLenght() * hidden_dim; + + T* temp_buf = (T*)output.data_ptr() + at::numel(output); + launch_bias_add_transform_0213((T*)query_cont, + kv_cache, + kv_cache + value_offset, + (T*)query_key_value.data_ptr(), + nullptr, + bsz, + seq_len, + (is_prompt ? 0 : soft_len - 1), + soft_len, + hidden_dim, + heads, + rotary_dim, + rotate_half, + rotate_every_two, + Context::Instance().GetCurrentStream(), + 3, + Context::Instance().GetMaxTokenLenght()); + if (rotary_dim > 0 && rotate_half) + launch_apply_rotary_pos_emb(query_cont, + kv_cache, + k, + seq_len, + rotary_dim, + (is_prompt ? 
0 : soft_len - 1), + heads, + bsz, + rotate_half, + rotate_every_two, + Context::Instance().GetCurrentStream(), + Context::Instance().GetMaxTokenLenght()); + + attention_unfused(workspace + offset, + (T*)query_cont, + attn_mask, + workspace + offset + value_offset, + temp_buf, + bsz, + k, + seq_len, + all_tokens, + heads, + norm_factor, + (triangular && is_prompt), + is_prompt, + local_attention, + window_size, + alibi, + layer_id); + launch_transform4d_0213((T*)output.data_ptr(), + temp_buf, + bsz, + heads, + seq_len, + output.size(2), + Context::Instance().GetCurrentStream(false), + 1); + + if (layer_id == num_layers - 1) Context::Instance().advance_tokens(); + auto prev_key = torch::from_blob(workspace + offset, {bsz, heads, all_tokens, k}, options); + auto prev_value = + torch::from_blob(workspace + offset + value_offset, {bsz, heads, all_tokens, k}, options); + return {output, prev_key, prev_value}; +} + template at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias) { @@ -238,6 +547,73 @@ at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias) return input_cont; } +at::Tensor ds_bias_geglu(at::Tensor& activation, at::Tensor& bias) +{ + /* + Used in FF of Stable diffusion + */ + + const int batch_size = activation.size(0); + const int seq_len = activation.size(1); + const int channels = activation.size(2); + + const int rows = batch_size * seq_len; + // Dimensionality is cut in half + const int out_channels = channels / 2; + + auto output = at::empty({batch_size, seq_len, out_channels}, activation.options()); + + if (activation.options().dtype() == torch::kFloat32) { + launch_fused_bias_geglu((float*)output.data_ptr(), + (const float*)activation.data_ptr(), + (const float*)bias.data_ptr(), + rows, + channels, + Context::Instance().GetCurrentStream()); + } else { + launch_fused_bias_geglu((__half*)output.data_ptr(), + (const __half*)activation.data_ptr(), + (const __half*)bias.data_ptr(), + rows, + channels, + Context::Instance().GetCurrentStream()); 
+ } + + return output; +} + +template +at::Tensor ds_bias_relu(at::Tensor& input, at::Tensor& bias) +{ + auto input_cont = input.contiguous(); + + int bsz = input_cont.size(0) * input_cont.size(1); + int intermediate_size = input_cont.size(2); + + launch_bias_relu((T*)input_cont.data_ptr(), + (T*)bias.data_ptr(), + intermediate_size, + bsz, + Context::Instance().GetCurrentStream()); + return input_cont; +} + +template +at::Tensor ds_bias_add(at::Tensor& input, at::Tensor& bias) +{ + auto input_cont = input.contiguous(); + + int bsz = input_cont.size(0) * input_cont.size(1); + int hidden_size = input_cont.size(2); + + launch_bias_add((T*)input_cont.data_ptr(), + (T*)bias.data_ptr(), + hidden_size, + bsz, + Context::Instance().GetCurrentStream()); + return input_cont; +} + template at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at::Tensor& bias) { @@ -255,85 +631,260 @@ at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at::Tensor& return input_cont; } +at::Tensor ds_layer_norm(at::Tensor& input, at::Tensor& gamma, at::Tensor& beta, float epsilon) +{ + const int rows = input.size(0) * input.size(1); + const int elems_per_row = input.size(2); + auto output = at::empty_like(input); + + if (input.options().dtype() == torch::kFloat16) { + launch_fused_ln((__half*)output.data_ptr(), + (const __half*)input.data_ptr(), + (const __half*)gamma.data_ptr(), + (const __half*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } else { + launch_fused_ln((float*)output.data_ptr(), + (const float*)input.data_ptr(), + (const float*)gamma.data_ptr(), + (const float*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } + + return output; +} + template -at::Tensor ds_layernorm(at::Tensor& input_cont, at::Tensor& gamma, at::Tensor& betta, float epsilon) +void ds_layer_norm_internal(T* workspace, + at::Tensor& input, + at::Tensor& gamma, + at::Tensor& beta, 
+ float epsilon) { - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - launch_layer_norm((T*)inp_norm.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)gamma.data_ptr(), - (T*)betta.data_ptr(), - epsilon, - bsz, - input_cont.size(2), - Context::Instance().GetCurrentStream()); - return inp_norm; + int bsz = input.size(0) * input.size(1); + launch_fused_ln(workspace, + (const T*)input.data_ptr(), + (const T*)gamma.data_ptr(), + (const T*)beta.data_ptr(), + epsilon, + bsz, + input.size(2), + Context::Instance().GetCurrentStream()); +} + +/* Currently only used in unit testing */ +at::Tensor ds_layer_norm_residual(at::Tensor& input, + at::Tensor& bias, + at::Tensor& residual, + at::Tensor& gamma, + at::Tensor& beta, + float epsilon) +{ + const int rows = input.size(0) * input.size(1); + const int elems_per_row = input.size(2); + auto output = at::empty_like(input); + + if (input.options().dtype() == torch::kFloat16) { + launch_fused_residual_ln((__half*)output.data_ptr(), + (const __half*)input.data_ptr(), + (const __half*)residual.data_ptr(), + (const __half*)bias.data_ptr(), + (const __half*)gamma.data_ptr(), + (const __half*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } else { + launch_fused_residual_ln((float*)output.data_ptr(), + (const float*)input.data_ptr(), + (const float*)residual.data_ptr(), + (const float*)bias.data_ptr(), + (const float*)gamma.data_ptr(), + (const float*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } + + return output; +} + +/* Currently only used in unit testing */ +std::vector ds_layer_norm_residual_store_pre_ln_res(at::Tensor& input, + at::Tensor& bias, + at::Tensor& residual, + at::Tensor& gamma, + at::Tensor& beta, + float epsilon) +{ + const int rows = input.size(0) * input.size(1); + const int elems_per_row = input.size(2); + auto norm_output = at::empty_like(input); + auto 
res_output = at::empty_like(input); + + if (input.options().dtype() == torch::kFloat16) { + launch_fused_residual_ln_store_pre_ln_res((__half*)norm_output.data_ptr(), + (__half*)res_output.data_ptr(), + (const __half*)input.data_ptr(), + (const __half*)residual.data_ptr(), + (const __half*)bias.data_ptr(), + (const __half*)gamma.data_ptr(), + (const __half*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } else { + launch_fused_residual_ln_store_pre_ln_res((float*)norm_output.data_ptr(), + (float*)res_output.data_ptr(), + (const float*)input.data_ptr(), + (const float*)residual.data_ptr(), + (const float*)bias.data_ptr(), + (const float*)gamma.data_ptr(), + (const float*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } + + return {norm_output, res_output}; } template -at::Tensor qkv_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) +void quantized_gemm(void* output, + T* input, + at::Tensor& weight, + at::Tensor& qscale, + int groups, + int bsz, + int hidden_size) { - auto inp_norm = ds_layernorm(input, gamma, beta, epsilon); + // T* weight16 = (T*)Context::Instance().GetWorkSpace() + 12 * hidden_size * bsz; - // cudaEventRecord(Context::Instance().GetCompEvent(1), Context::Instance().GetCurrentStream()); + auto options = at::TensorOptions() + .dtype(at::kHalf) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + auto tmp = torch::empty(weight.sizes(), options); + T* weight16 = (T*)tmp.data_ptr(); + launch_dequantize(weight16, + (int8_t*)weight.data_ptr(), + (float*)qscale.data_ptr(), + weight.size(0), + weight.size(1), + groups, + Context::Instance().GetCurrentStream()); float alpha = (T)1.0; float gemm_beta = (T)0.0; - int bsz = input.size(0) * input.size(1); - cublasSetStream(Context::Instance().GetCublasHandle(), 
Context::Instance().GetCurrentStream()); cublas_gemm_ex(Context::Instance().GetCublasHandle(), + CUBLAS_OP_T, CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), + weight.size(0), bsz, - input.size(2), + weight.size(1), &alpha, &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), + weight16, + (T*)input, + (T*)output, #ifdef __HIP_PLATFORM_HCC__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif +} + +template +at::Tensor qkv_unfused_cublas(at::Tensor& output, + at::Tensor& input, + at::Tensor& weight, + at::Tensor& q_scale, + at::Tensor& bias, + at::Tensor& gamma, + at::Tensor& beta, + const float epsilon, + bool add_bias, + bool q_int8) +{ + int bsz = input.size(0) * input.size(1); + T* workspace = (T*)Context::Instance().GetWorkSpace(); + workspace += (3 * bsz * input.size(2)); + ds_layer_norm_internal(workspace, input, gamma, beta, epsilon); + + if (q_int8) { + quantized_gemm( + output.data_ptr(), workspace, weight, q_scale, q_scale.size(0), bsz, input.size(2)); + } else { + float alpha = (T)1.0; + float gemm_beta = (T)0.0; + + cublasSetStream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream()); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + weight.size(1), + bsz, + input.size(2), + &alpha, + &gemm_beta, + (T*)weight.data_ptr(), + workspace, + (T*)output.data_ptr(), +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo_standard); +#else + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif + } if (add_bias) launch_bias_add((T*)output.data_ptr(), (T*)bias.data_ptr(), - weight.size(1), + q_int8 ? 
weight.size(0) : weight.size(1), bsz, Context::Instance().GetCurrentStream()); - return inp_norm; + return torch::from_blob(workspace, input.sizes(), input.options()); } template std::vector ds_qkv_gemm(at::Tensor& input, at::Tensor& weight, + at::Tensor& q_scale, at::Tensor& bias, at::Tensor& gamma, at::Tensor& beta, const float epsilon, - bool add_bias) + bool add_bias, + unsigned num_layers, + bool external_cache, + unsigned mp_size, + unsigned rank, + bool q_int8) { - auto input_cont = input.contiguous(); + int bsz = input.size(0) * input.size(1); + T* workspace = (T*)Context::Instance().GetWorkSpace(); + int out_size = q_int8 ? weight.size(0) : weight.size(1); + auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) + .dtype(input.options().dtype()) .layout(at::kStrided) .device(at::kCUDA) .requires_grad(false); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = - qkv_unfused_cublas(output, input_cont, weight, bias, gamma, beta, epsilon, add_bias); + auto output = at::from_blob(workspace, {input.size(0), input.size(1), out_size}, options); + auto inp_norm = qkv_unfused_cublas( + output, input, weight, q_scale, bias, gamma, beta, epsilon, add_bias, q_int8); return {output, inp_norm}; } @@ -357,20 +908,18 @@ void quantized_gemm(at::Tensor& output, launch_dequantize((T*)weight16.data_ptr(), (int8_t*)weight.data_ptr(), (float*)qscale.data_ptr(), - weight.size(1), weight.size(0), + weight.size(1), groups, merge_count, Context::Instance().GetCurrentStream()); - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - float alpha = (T)1.0; float gemm_beta = (T)0.0; cublas_gemm_ex(Context::Instance().GetCublasHandle(), + CUBLAS_OP_T, CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), + weight.size(0), bsz, input.size(2), &alpha, @@ -406,7 +955,7 @@ at::Tensor ds_qkv_gemm_int8(at::Tensor& input, auto 
output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - auto inp_norm = ds_layernorm(input_cont, gamma, beta, epsilon); + auto inp_norm = ds_layer_norm(input_cont, gamma, beta, epsilon); quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); if (add_bias) @@ -420,7 +969,12 @@ at::Tensor ds_qkv_gemm_int8(at::Tensor& input, } template -at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bias) +at::Tensor ds_linear_layer(at::Tensor& input, + at::Tensor& weight, + at::Tensor& bias, + bool add_bias, + bool do_flash_attn, + int num_heads) { auto input_cont = input.contiguous(); auto options = at::TensorOptions() @@ -429,8 +983,10 @@ at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bi .device(at::kCUDA) .requires_grad(false); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); + int head_size = input_cont.size(2) / num_heads; + int bsz = input.size(0) * input.size(1); + T* workspace = (T*)Context::Instance().GetWorkSpace(); + auto output = at::from_blob(workspace, {input.size(0), input.size(1), weight.size(1)}, options); float alpha = (T)1.0; float gemm_beta = (T)0.0; @@ -452,16 +1008,172 @@ at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bi #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif + if (add_bias) + launch_bias_add((T*)output.data_ptr(), + (T*)bias.data_ptr(), + weight.size(1), + bsz, + Context::Instance().GetCurrentStream()); + bool add_padding = (head_size % 32 != 0 && head_size < 64) || (head_size % 64 != 0); + if (do_flash_attn) { + if (add_padding) { + int padded_head_size = head_size < 32 ? 32 : (head_size < 64 ? 
64 : 128); + auto padded_output = workspace + output.numel(); + auto final_output = + padded_output + (input.size(0) * input.size(1) * 3 * num_heads * padded_head_size); + pad_data(padded_output, + workspace, + 3 * bsz * num_heads, + head_size, + padded_head_size, + Context::Instance().GetCurrentStream()); - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); + launch_bias_add_transform_0213( + final_output, + final_output + (input.size(0) * input.size(1) * num_heads * padded_head_size), + final_output + (input.size(0) * input.size(1) * 2 * num_heads * padded_head_size), + padded_output, + nullptr, + input.size(0), + input.size(1), + 0, + input.size(1), + (num_heads * padded_head_size), + num_heads, + -1, + false, + false, + Context::Instance().GetCurrentStream(), + 3, + input.size(1)); + return at::from_blob(final_output, + {3, input.size(0), num_heads, input.size(1), padded_head_size}, + options); + // return at::from_blob(padded_output, {input.size(0) * input.size(1), 3, num_heads, + // padded_head_size}, options); + } else { + auto final_output = workspace + output.numel(); + launch_bias_add_transform_0213( + final_output, + final_output + (input.size(0) * input.size(1) * input_cont.size(2)), + final_output + (input.size(0) * input.size(1) * 2 * input_cont.size(2)), + workspace, + nullptr, + input.size(0), + input.size(1), + 0, + input.size(1), + input_cont.size(2), + num_heads, + -1, + false, + false, + Context::Instance().GetCurrentStream(), + 3, + input.size(1)); + return at::from_blob( + final_output, {3, input.size(0), num_heads, input.size(1), head_size}, options); + // return at::from_blob(workspace, {input.size(0) * input.size(1), 3, num_heads, + // head_size}, options); + } + + } else + return output; +} - return output; +template +std::vector add_padding(at::Tensor& query, at::Tensor& key, at::Tensor& value) +{ + int head_size = query.size(3); + int padded_head_size = 
head_size < 32 ? 32 : (head_size < 64 ? 64 : 128); + T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* key_pad_ptr = workspace + padded_head_size * query.size(0) * query.size(1) * query.size(2); + T* value_pad_ptr = key_pad_ptr + padded_head_size * query.size(0) * query.size(1) * 128; + pad_head_seq(workspace, + (T*)query.data_ptr(), + query.size(0) * query.size(1), + query.size(2), + query.size(2), + head_size, + padded_head_size, + Context::Instance().GetCurrentStream()); + pad_head_seq(key_pad_ptr, + (T*)key.data_ptr(), + query.size(0) * query.size(1), + key.size(2), + 128, + head_size, + padded_head_size, + Context::Instance().GetCurrentStream()); + pad_head_seq(value_pad_ptr, + (T*)value.data_ptr(), + query.size(0) * query.size(1), + key.size(2), + 128, + head_size, + padded_head_size, + Context::Instance().GetCurrentStream()); + return { + at::from_blob(workspace, + {query.size(0), query.size(1), query.size(2), padded_head_size}, + query.options()), + at::from_blob( + key_pad_ptr, {query.size(0), query.size(1), 128, padded_head_size}, query.options()), + at::from_blob( + value_pad_ptr, {query.size(0), query.size(1), 128, padded_head_size}, query.options())}; } +template +std::vector padd_add_transform(at::Tensor& query, + at::Tensor& key, + at::Tensor& value, + int heads, + bool add_padding) +{ + int head_size = query.size(2) / heads; + int key_value_length = add_padding ? 128 : key.size(1); + int padded_head_size = add_padding ? (head_size < 32 ? 32 : (head_size < 64 ? 
64 : 128)) + : head_size; + T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* key_pad_ptr = workspace + padded_head_size * query.size(0) * heads * query.size(1); + T* value_pad_ptr = key_pad_ptr + padded_head_size * query.size(0) * heads * key_value_length; + launch_pad_add_transform_0213(workspace, + (T*)query.data_ptr(), + query.size(0), + query.size(2), + query.size(1), + query.size(1), + heads, + padded_head_size, + Context::Instance().GetCurrentStream()); + launch_pad_add_transform_0213(key_pad_ptr, + (T*)key.data_ptr(), + key.size(0), + key.size(2), + key.size(1), + key_value_length, + heads, + padded_head_size, + Context::Instance().GetCurrentStream()); + launch_pad_add_transform_0213(value_pad_ptr, + (T*)value.data_ptr(), + value.size(0), + value.size(2), + value.size(1), + key_value_length, + heads, + padded_head_size, + Context::Instance().GetCurrentStream()); + return { + at::from_blob( + workspace, {query.size(0), heads, query.size(1), padded_head_size}, query.options()), + at::from_blob(key_pad_ptr, + {query.size(0), heads, key_value_length, padded_head_size}, + query.options()), + at::from_blob(value_pad_ptr, + {query.size(0), heads, key_value_length, padded_head_size}, + query.options())}; +} template at::Tensor ds_linear_layer_int8(at::Tensor& input, at::Tensor& weight, @@ -489,37 +1201,52 @@ at::Tensor ds_linear_layer_int8(at::Tensor& input, } template -at::Tensor ds_vector_matmul(at::Tensor& input, at::Tensor& weight, bool async_op) +at::Tensor ds_vector_matmul(at::Tensor& input, + at::Tensor& weight, + bool async_op, + at::Tensor& q_scale, + bool q_int8) { - auto input_cont = input.contiguous(); auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) + .dtype(input.options().dtype()) .layout(at::kStrided) .device(at::kCUDA) .requires_grad(false); + int out_size = q_int8 ? 
weight.size(0) : weight.size(1); + int bsz = input.size(0) * input.size(1); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream(async_op)); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), + T* workspace = (T*)Context::Instance().GetWorkSpace(); + auto output = at::from_blob(workspace, {input.size(0), input.size(1), out_size}, options); + if (q_int8) { + quantized_gemm(output.data_ptr(), + (T*)input.data_ptr(), + weight, + q_scale, + q_scale.size(0), + bsz, + input.size(2)); + } else { + float alpha = (T)1.0; + float gemm_beta = (T)0.0; + cublasSetStream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream(async_op)); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + weight.size(1), + bsz, + input.size(2), + &alpha, + &gemm_beta, + (T*)weight.data_ptr(), + (T*)input.data_ptr(), + (T*)output.data_ptr(), #ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); + rocblas_gemm_algo_standard); #else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif + } return output; } @@ -544,95 +1271,163 @@ at::Tensor ds_vector_matmul_int8(at::Tensor& input, } template -void mlp_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) +at::Tensor mlp_unfused_cublas(at::Tensor& output, + at::Tensor& input, + at::Tensor& residual, + at::Tensor& input_bias, + at::Tensor& weight, + 
at::Tensor& weight1, + at::Tensor& bias, + at::Tensor& gamma, + at::Tensor& beta, + const float epsilon, + bool preLayerNorm, + bool mlp_after_attn, + at::Tensor& q_scale, + at::Tensor& q_scale1, + bool q_int8, + ActivationFuncType act_func_type) { int bsz = input.size(0) * input.size(1); - auto inp_norm = at::empty_like(input); - - launch_residual_layer_norm((T*)inp_norm.data_ptr(), - (T*)nullptr, - (T*)input.data_ptr(), - (T*)residual.data_ptr(), - (T*)input_bias.data_ptr(), - (T*)gamma.data_ptr(), - (T*)beta.data_ptr(), - epsilon, - bsz, - input.size(2), - preLayerNorm, - mlp_after_attn, - Context::Instance().GetCurrentStream()); + T* inp_norm = + (T*)Context::Instance().GetWorkSpace() + torch::numel(input) + torch::numel(output); + T* intermediate = inp_norm + torch::numel(input); + + if (mlp_after_attn) { + launch_fused_residual_ln((T*)inp_norm, + (const T*)input.data_ptr(), + (const T*)residual.data_ptr(), + (const T*)input_bias.data_ptr(), + (const T*)gamma.data_ptr(), + (const T*)beta.data_ptr(), + epsilon, + bsz, + input.size(2), + Context::Instance().GetCurrentStream()); + } else { + ds_layer_norm_internal(inp_norm, input, gamma, beta, epsilon); + } + if (q_int8) { + quantized_gemm( + intermediate, inp_norm, weight, q_scale, q_scale.size(0), bsz, input.size(2)); + } else { + float alpha = (T)1.0; + float gemm_beta = (T)0.0; + cublasSetStream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream()); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + weight.size(1), + bsz, + input.size(2), + &alpha, + &gemm_beta, + (T*)weight.data_ptr(), + inp_norm, + intermediate, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo_standard); +#else + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif + } + if (act_func_type == ActivationFuncType::GELU) { + launch_bias_gelu(intermediate, + (T*)bias.data_ptr(), + q_int8 ? 
weight.size(0) : weight.size(1), + bsz, + Context::Instance().GetCurrentStream()); + } else if (act_func_type == ActivationFuncType::ReLU) { + launch_bias_relu(intermediate, + (T*)bias.data_ptr(), + q_int8 ? weight.size(0) : weight.size(1), + bsz, + Context::Instance().GetCurrentStream()); + } - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), + if (q_int8) { + quantized_gemm(output.data_ptr(), + intermediate, + weight1, + q_scale1, + q_scale1.size(0), + bsz, + input.size(2)); + } else { + float alpha = (T)1.0; + float gemm_beta = (T)0.0; + cublasSetStream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream()); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + weight1.size(1), + bsz, + weight1.size(0), + &alpha, + &gemm_beta, + (T*)weight1.data_ptr(), + intermediate, + (T*)output.data_ptr(), #ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); + rocblas_gemm_algo_standard); #else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); + } + + return torch::from_blob(inp_norm, input.sizes(), input.options()); } + template -at::Tensor ds_mlp_gemm(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) +std::vector ds_mlp_gemm(at::Tensor& input, + at::Tensor& residual, + at::Tensor& input_bias, + at::Tensor& weight_interm, + at::Tensor& weight_out, + at::Tensor& bias, 
+ at::Tensor& gamma, + at::Tensor& beta, + const float epsilon, + bool preLayerNorm, + bool mlp_after_attn, + at::Tensor& q_scale, + at::Tensor& q_scale1, + bool q_int8, + int activation_type) { - auto input_cont = input.contiguous(); auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) + .dtype(input.options().dtype()) .layout(at::kStrided) .device(at::kCUDA) .requires_grad(false); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - mlp_unfused_cublas(output, - mlp_after_attn ? input : residual, - residual, - input_bias, - weight, - bias, - gamma, - beta, - epsilon, - preLayerNorm, - mlp_after_attn); + int out_size = q_int8 ? weight_out.size(0) : weight_out.size(1); + auto output = at::from_blob((T*)Context::Instance().GetWorkSpace() + torch::numel(input), + {input.size(0), input.size(1), out_size}, + options); + int bsz = input.size(0) * input.size(1); - return output; + auto act_func_type = static_cast(activation_type); + auto res_add = mlp_unfused_cublas(output, + mlp_after_attn ? input : residual, + residual, + input_bias, + weight_interm, + weight_out, + bias, + gamma, + beta, + epsilon, + preLayerNorm, + mlp_after_attn, + q_scale, + q_scale1, + q_int8, + act_func_type); + + return {output, res_add}; } template @@ -661,20 +1456,6 @@ std::vector ds_mlp_gemm_int8(at::Tensor& input, auto inp_norm = at::empty_like(input_cont); auto residual_add = (preLayerNorm ? 
at::empty_like(input_cont) : inp_norm); - // computing the blocking across K dimension - // launch_residual_layer_norm((T*)inp_norm.data_ptr(), - // (T*)residual_add.data_ptr(), - // (T*)input_cont.data_ptr(), - // (T*)residual.data_ptr(), - // (T*)input_bias.data_ptr(), - // (T*)gamma.data_ptr(), - // (T*)beta.data_ptr(), - // epsilon, - // bsz, - // input_cont.size(2), - // preLayerNorm, - // Context::Instance().GetCurrentStream()); - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); launch_bias_gelu((T*)output.data_ptr(), (T*)bias.data_ptr(), @@ -688,122 +1469,136 @@ std::vector ds_mlp_gemm_int8(at::Tensor& input, template at::Tensor fused_gemm_gelu(at::Tensor& input, at::Tensor& weight, + at::Tensor& weight_scale, at::Tensor& bias, at::Tensor& weight_out, + at::Tensor& weight_out_scale, const float epsilon, bool preLayerNorm, + bool q_int8, bool async_op) { - auto input_cont = input.contiguous(); auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) + .dtype(input.options().dtype()) .layout(at::kStrided) .device(at::kCUDA) .requires_grad(false); - auto intermediate = - at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight_out.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); + int intm_dim = q_int8 ? 
weight.size(0) : weight.size(1); + + // auto output = at::from_blob((T*)Context::Instance().GetWorkSpace() + torch::numel(input), + // {input.size(0), input.size(1), out_size}, + // options); + // T* intermediate = (T*)input.data_ptr() + torch::numel(input); + auto intermediate = at::empty({input.size(0), input.size(1), intm_dim}, options); + + int bsz = input.size(0) * input.size(1); + float alpha = (T)1.0; float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)intermediate.data_ptr(), + if (q_int8) { + quantized_gemm(intermediate.data_ptr(), + (T*)input.data_ptr(), + weight, + weight_scale, + weight_scale.size(0), + bsz, + input.size(2)); + } else { + cublasSetStream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream()); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + intm_dim, + bsz, + input.size(2), + &alpha, + &gemm_beta, + (T*)weight.data_ptr(), + (T*)input.data_ptr(), + (T*)intermediate.data_ptr(), #ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); + rocblas_gemm_algo_standard); #else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif + } launch_bias_gelu((T*)intermediate.data_ptr(), (T*)bias.data_ptr(), - weight.size(1), + intm_dim, bsz, Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight_out.size(1), - bsz, - intermediate.size(2), - &alpha, - &gemm_beta, - (T*)weight_out.data_ptr(), - (T*)intermediate.data_ptr(), - (T*)output.data_ptr(), + int out_size = q_int8 ? 
weight_out.size(0) : weight_out.size(1); + auto output = at::empty({input.size(0), input.size(1), out_size}, options); + if (q_int8) { + quantized_gemm(output.data_ptr(), + (T*)intermediate.data_ptr(), + weight_out, + weight_out_scale, + weight_out_scale.size(0), + bsz, + input.size(2)); + } else { + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + out_size, + bsz, + intm_dim, + &alpha, + &gemm_beta, + (T*)weight_out.data_ptr(), + (T*)intermediate.data_ptr(), + (T*)output.data_ptr(), #ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); + rocblas_gemm_algo_standard); #else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif + } // cudaEventRecord(Context::Instance().GetCompEvent(2), // Context::Instance().GetCurrentStream(true)); return output; } -void residual_add_bias(at::Tensor& output, - at::Tensor& input, - at::Tensor& attention_output, - at::Tensor& output_b, - at::Tensor& attention_b, - int mp_size, - bool mlp_after_attn) +template +at::Tensor& residual_add_bias(at::Tensor& hidden_state, + at::Tensor& residual, + const at::Tensor& attention_output, + const at::Tensor& attention_bias, + const at::Tensor& final_bias, + const int mp_size, + const bool mlp_after_attn, + const bool add_bias, + const bool preln) { - int bsz = input.size(0) * input.size(1); - int hidden_size = input.size(2); - // cudaStreamWaitEvent( - // Context::Instance().GetCurrentStream(), Context::Instance().GetCompEvent(2), 0); - if (input.scalar_type() == at::kFloat) - if (mlp_after_attn) - launch_bias_residual((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - launch_gptj_residual_add((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - 
(float*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); - else if (mlp_after_attn) - launch_bias_residual((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), + int bsz = residual.size(0) * residual.size(1); + int hidden_size = residual.size(2); + if (mlp_after_attn) + launch_bias_residual(static_cast(residual.data_ptr()), + static_cast(hidden_state.data_ptr()), + static_cast(attention_output.data_ptr()), + static_cast(final_bias.data_ptr()), + static_cast(attention_bias.data_ptr()), bsz, hidden_size, mp_size, + preln, Context::Instance().GetCurrentStream()); else - launch_gptj_residual_add<__half>((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); + launch_gptj_residual_add( + static_cast(residual.data_ptr()), + static_cast(hidden_state.data_ptr()), + static_cast(attention_output.data_ptr()), + static_cast(final_bias.data_ptr()), + static_cast((add_bias ? 
attention_bias.data_ptr() : nullptr)), + hidden_size, + bsz, + mp_size, + Context::Instance().GetCurrentStream()); + return residual; } std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, @@ -832,7 +1627,8 @@ std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, bsz, rotate_half, rotate_every_two, - Context::Instance().GetCurrentStream()); + Context::Instance().GetCurrentStream(), + Context::Instance().GetMaxTokenLenght()); else launch_apply_rotary_pos_emb<__half>((__half*)query_cont.data_ptr(), (__half*)key_cont.data_ptr(), @@ -844,7 +1640,8 @@ std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, bsz, rotate_half, rotate_every_two, - Context::Instance().GetCurrentStream()); + Context::Instance().GetCurrentStream(), + Context::Instance().GetMaxTokenLenght()); return {query_cont, key_cont}; } @@ -904,22 +1701,34 @@ at::Tensor moe_res_matmul(at::Tensor& moe_res, at::Tensor& coef, at::Tensor& out PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("softmax_fp32", &ds_softmax, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def("softmax_fp16", &ds_softmax<__half>, "DeepSpeed SoftMax with fp32 (CUDA)"); + m.def("softmax_fp16", &ds_softmax<__half>, "DeepSpeed SoftMax with fp16 (CUDA)"); m.def( "softmax_context_fp32", &ds_softmax_context, "DeepSpeed attention with fp32 (CUDA)"); m.def("softmax_context_fp16", &ds_softmax_context<__half>, - "DeepSpeed attention with fp32 (CUDA)"); + "DeepSpeed attention with fp16 (CUDA)"); + m.def("softmax_context_int8", + &ds_softmax_context1<__half>, + "DeepSpeed attention with int8 (CUDA)"); m.def("bias_gelu_fp32", &ds_bias_gelu, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp32 (CUDA)"); + m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp16 (CUDA)"); + m.def("bias_geglu", &ds_bias_geglu, "DeepSpeed Bias GEGLU (CUDA)"); + m.def("bias_add_fp32", &ds_bias_add, "DeepSpeed Bias Add with fp32 (CUDA)"); + m.def("bias_add_fp16", &ds_bias_add<__half>, 
"DeepSpeed Gelu with fp16 (CUDA)"); + m.def("bias_relu_fp32", &ds_bias_relu, "DeepSpeed ReLU with fp32 (CUDA)"); + m.def("bias_relu_fp16", &ds_bias_relu<__half>, "DeepSpeed ReLU with fp16 (CUDA)"); m.def("bias_residual_fp32", &ds_bias_residual, "DeepSpeed residual-bias add with fp32 (CUDA)"); m.def("bias_residual_fp16", &ds_bias_residual<__half>, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("layer_norm_fp32", &ds_layernorm, "DeepSpeed layer-norm with fp32 (CUDA)"); - m.def("layer_norm_fp16", &ds_layernorm<__half>, "DeepSpeed layer-norm with fp16 (CUDA)"); + "DeepSpeed residual-bias add with fp16 (CUDA)"); + m.def("layer_norm", &ds_layer_norm, "DeepSpeed layer norm (CUDA)"); + m.def( + "_layer_norm_residual", &ds_layer_norm_residual, "DeepSpeed layer norm + residual (CUDA)"); + m.def("layer_norm_residual_store_pre_ln_res", + &ds_layer_norm_residual_store_pre_ln_res, + "DeepSpeed layer norm + store pre Layernorm residual (CUDA)"); m.def("qkv_gemm_fp32", &ds_qkv_gemm, "DeepSpeed qkv gemm with fp32 (CUDA)"); m.def("qkv_gemm_fp16", &ds_qkv_gemm<__half>, "DeepSpeed qkv gemm with fp16 (CUDA)"); m.def("qkv_gemm_int8", &ds_qkv_gemm_int8<__half>, "DeepSpeed qkv gemm with int8 (CUDA)"); @@ -938,7 +1747,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "DeepSpeed linear_layer with int8 (CUDA)"); m.def("fused_gemm_gelu_fp32", &fused_gemm_gelu, "DeepSpeed mlp with fp32 (CUDA)"); m.def("fused_gemm_gelu_fp16", &fused_gemm_gelu<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("residual_add", &residual_add_bias, "DeepSpeed mlp with fp16 (CUDA)"); + m.def("residual_add_bias_fp32", + &residual_add_bias, + "DeepSpeed residual add with fp32 (CUDA)"); + m.def("residual_add_bias_fp16", + &residual_add_bias<__half>, + "DeepSpeed residual add with fp16 (CUDA)"); m.def("apply_rotary_pos_emb", &apply_rotary_pos_emb, "DeepSpeed mlp with fp16 (CUDA)"); m.def("einsum_sec_sm_ecm_fp32", &einsum_sec_sm_ecm, @@ -948,4 +1762,19 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 
&einsum_sec_sm_ecm<__half>, "DeepSpeed vector-MM with fp16 (CUDA)"); m.def("moe_res_matmul", &moe_res_matmul, "DeepSpeed moe residual matmul (CUDA)"); + m.def("add_padding_fp32", &add_padding, "DeepSpeed residual add with fp32 (CUDA)"); + m.def("add_padding_fp16", &add_padding<__half>, "DeepSpeed residual add with fp16 (CUDA)"); + m.def("pad_transform_fp32", + &padd_add_transform, + "DeepSpeed residual add with fp32 (CUDA)"); + m.def("pad_transform_fp16", + &padd_add_transform<__half>, + "DeepSpeed residual add with fp16 (CUDA)"); + m.def("allocate_workspace_fp32", + &allocate_workspace, + "DeepSpeed memory allocation for GPT inference with fp32 (CUDA)"); + m.def("allocate_workspace_fp16", + &allocate_workspace<__half>, + "DeepSpeed memory allocation for GPT inference with fp16 (CUDA)"); + m.def("reset_cache", &reset_cache, "Reset Cache for generation tasks"); } diff --git a/csrc/transformer/inference/csrc/pt_binding_hip.cpp b/csrc/transformer/inference/csrc/pt_binding_hip.cpp index 6fed126f2c360dd3eec0ce9831b200acce3cd9d9..a85291cb730dfec5315de33c746623f74c4785c0 100644 --- a/csrc/transformer/inference/csrc/pt_binding_hip.cpp +++ b/csrc/transformer/inference/csrc/pt_binding_hip.cpp @@ -1,40 +1,90 @@ // !!! This is a file automatically generated by hipify!!! +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ -#include +#include #include +#include #include -#include "context_hip.h" -#include "cublas_wrappers_hip.h" -#include "custom_hip_layers.h" +#include "inference_context.h" +#include "inference_cublas_wrappers.h" +#include "inference_cuda_layers.h" std::array gemm_algos = std::array({99, 99, 99}); -#define MAX_OUT_TOKES 10 +// NOTE: This activation function type enum should be always in sync +// with the python counterpart, otherwise the casting from python binding +// will be incorrect. 
+enum class ActivationFuncType { UNKNOWN = 0, GELU = 1, ReLU = 2 }; + +enum class TransformerType : uint8_t { UNKNOWN = 0, GPTType = 1, BERTType = 2 }; + +// NOTE: this is a temporary and dodgy solution to distinguish GPT and BERT style models +// based on the dimensions of the corresponding attention mask. +inline auto infer_transformer_type(at::Tensor& attn_mask) -> TransformerType +{ + auto attn_mask_num_dims = attn_mask.sizes().size(); + + if (attn_mask_num_dims > 2) { + return TransformerType::GPTType; + } else if (attn_mask_num_dims == 2) { + return TransformerType::BERTType; + } else { + return TransformerType::UNKNOWN; + } +} + +// infer stride of attention mask memory layout based on the model type. +inline auto get_attn_mask_stride(at::Tensor& attn_mask) -> int +{ + auto trnsfrmr_type = infer_transformer_type(attn_mask); + + if (trnsfrmr_type == TransformerType::GPTType) { + return attn_mask.size(2); + } else if (trnsfrmr_type == TransformerType::BERTType) { + // Bert style models have always a mask stride of 1. + return 1; + } else if (trnsfrmr_type == TransformerType::UNKNOWN) { + return 0; + } + + // this is just to make the compiler happy. 
+ return 0; +} template at::Tensor ds_softmax(at::Tensor& attn_scores, at::Tensor& attn_mask, + at::Tensor& alibi, bool triangular, bool recompute, bool local_attention, int window_size, - bool async_op) + bool async_op, + float layer_scale, + int head_offset, + int mp_size) { auto attn_scores_c = attn_scores.contiguous(); int bsz = attn_scores_c.size(0); int seq_len = attn_scores_c.size(1); int len = attn_scores_c.sizes().size(); - if (len > 3) seq_len = attn_scores_c.size(2); + if (len > 2) seq_len = attn_scores_c.size(2); int soft_len = attn_scores_c.size(2); if (len > 3) soft_len = attn_scores_c.size(3); int heads = 1; - if (len > 3) heads = attn_scores_c.size(1); + if (len > 1) heads = attn_scores_c.size(1); + + auto mask_stride = get_attn_mask_stride(attn_mask); launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), + (alibi.sizes().size() > 1 ? (T*)alibi.data_ptr() : nullptr), + layer_scale, triangular, recompute, local_attention, @@ -43,20 +93,35 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, heads, seq_len, soft_len, - 1.0, + head_offset, + mask_stride, + mp_size, Context::Instance().GetCurrentStream(async_op)); return attn_scores_c; } template -void allocate_workspace(size_t hidden_dim, - size_t max_seq_len, - size_t batch_size, - size_t head_size = 128) +void allocate_workspace(unsigned hidden_dim, + unsigned num_heads, + unsigned prompt_length, + unsigned batch_size, + unsigned num_layers, + unsigned mp_size = 1, + bool external_cache = false, + unsigned rank = 0, + unsigned max_out_tokens = 1024) { - size_t _workSpaceSize = (hidden_dim * batch_size * max_seq_len); - Context::Instance().GenWorkSpace(_workSpaceSize * sizeof(T)); + Context::Instance().GenWorkSpace(num_layers, + num_heads, + batch_size, + prompt_length, + hidden_dim, + mp_size, + external_cache, + sizeof(T), + rank, + max_out_tokens); } template @@ -71,10 +136,13 @@ at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& 
W) float alpha = 1; float gemm_beta = 0.0; - if (!workspace) { - allocate_workspace(W.size(1), MAX_OUT_TOKES, Q.size(0)); - workspace = (T*)Context::Instance().GetWorkSpace(); + /* + // Reallocate memory if we received a new prompt + if (!workspace || input.size(1) != 1) { + allocate_workspace(W.size(1), Context::Instance().GetMaxTokenLenght(), Q.size(0), 1, + head_size); workspace = (T*)Context::Instance().GetWorkSpace(); } + */ auto O = at::from_blob(workspace, {Q.size(1), Q.size(2), W.size(1)}, options); unsigned m = W.size(1); @@ -124,6 +192,9 @@ void attention_unfused(at::Tensor& prev_key_cont, float gemm_beta = 0.0; auto attn_score = at::empty({bsz, heads, seq_len, soft_len}, options); int k = prev_value_cont.size(2) / heads; + + auto mask_stride = get_attn_mask_stride(attn_mask); + rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), soft_len, @@ -145,8 +216,22 @@ void attention_unfused(at::Tensor& prev_key_cont, #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif - attn_score = ds_softmax( - attn_score, attn_mask, triangular, recompute, local_attention, window_size, false); + launch_attn_softmax_v2((T*)attn_score.data_ptr(), + (T*)(attn_mask.sizes().size() > 1 ? 
attn_mask.data_ptr() : nullptr), + (T*)nullptr, + 1.0, + triangular, + recompute, + local_attention, + window_size, + bsz, + heads, + seq_len, + soft_len, + 0, + mask_stride, + 1, + Context::Instance().GetCurrentStream(false)); alpha = 1.0; cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), k, @@ -171,19 +256,19 @@ void attention_unfused(at::Tensor& prev_key_cont, } template -std::vector ds_softmax_context(at::Tensor& query, - at::Tensor& prev_key, - at::Tensor& new_key, - at::Tensor& attn_mask, - at::Tensor& prev_value, - at::Tensor& new_value, - int heads, - float norm_factor, - bool merging, - bool triangular, - bool local_attention, - int window_size, - bool no_masking) +std::vector ds_softmax_context1(at::Tensor& query, + at::Tensor& prev_key, + at::Tensor& new_key, + at::Tensor& attn_mask, + at::Tensor& prev_value, + at::Tensor& new_value, + int heads, + float norm_factor, + bool merging, + bool triangular, + bool local_attention, + int window_size, + bool no_masking) { auto query_cont = query.contiguous(); auto prev_key_cont = prev_key.contiguous(); @@ -223,6 +308,230 @@ std::vector ds_softmax_context(at::Tensor& query, return {output, prev_key, prev_value}; } +template +void ds_softmax_internal(T* attn_scores, + at::Tensor& attn_mask, + at::Tensor& alibi, + float& layer_scale, + bool triangular, + bool recompute, + bool local_attention, + int window_size, + int bsz, + int seq_len, + int soft_len, + int heads) +{ + auto mask_stride = get_attn_mask_stride(attn_mask); + + launch_attn_softmax_v2((T*)attn_scores, + (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), + (alibi.sizes().size() > 1 ? 
(T*)alibi.data_ptr() : nullptr), + layer_scale, + triangular, + recompute, + local_attention, + window_size, + bsz, + heads, + seq_len, + soft_len, + 0, + mask_stride, + 1, + at::hip::getCurrentHIPStreamMasqueradingAsCUDA()); +} + +template +void attention_unfused(T* prev_key_cont, + T* query_cont, + at::Tensor& attn_mask, + T* prev_value_cont, + T* output, + unsigned& bsz, + int& k, + unsigned& seq_len, + unsigned& soft_len, + int& heads, + float& norm_factor, + bool triangular, + bool recompute, + bool local_attention, + int window_size, + at::Tensor& alibi, + int layer_id) +{ + float layer_scale = alibi.sizes().size() > 1 ? std::max(1, layer_id) : 1.0; + float alpha = norm_factor * norm_factor / layer_scale; + float gemm_beta = 0.0; + T* workspace = (T*)Context::Instance().GetAttentionUnfusedWorkspace(); + + rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); + cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), + soft_len, + seq_len, + k, + &alpha, + &gemm_beta, + (T*)prev_key_cont, + (T*)query_cont, + workspace, + rocblas_operation_transpose, + rocblas_operation_none, + Context::Instance().GetMaxTokenLenght() * k, + seq_len * k, + seq_len * soft_len, + bsz * heads, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo_standard); +#else + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif + ds_softmax_internal(workspace, + attn_mask, + alibi, + layer_scale, + triangular, + recompute, + local_attention, + window_size, + bsz, + seq_len, + soft_len, + heads); + alpha = 1.0; + cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), + k, + seq_len, + soft_len, + &alpha, + &gemm_beta, + (T*)prev_value_cont, + workspace, + (T*)output, + rocblas_operation_none, + rocblas_operation_none, + Context::Instance().GetMaxTokenLenght() * k, + seq_len * soft_len, + seq_len * k, + bsz * heads, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo_standard); +#else + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif +} + +void 
reset_cache() { Context::Instance().reset_tokens(); } + +template +std::vector ds_softmax_context(at::Tensor& query_key_value, + at::Tensor& attn_mask, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + int heads, + float norm_factor, + bool triangular, + bool local_attention, + int window_size, + bool no_masking, + unsigned layer_id, + unsigned num_layers, + at::Tensor& alibi) +{ + unsigned bsz = query_key_value.size(0); + unsigned seq_len = query_key_value.size(1); + unsigned hidden_dim = query_key_value.size(2) / 3; + + bool is_prompt = (seq_len > 1); + + if (is_prompt) Context::Instance().reset_tokens(seq_len); + unsigned soft_len = Context::Instance().current_tokens(); + + int k = hidden_dim / heads; + auto options = at::TensorOptions() + .dtype(query_key_value.options().dtype()) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + T* workspace = (T*)Context::Instance().GetWorkSpace(); + size_t buf_size = bsz * seq_len * hidden_dim; + auto output = torch::from_blob(workspace + 4 * buf_size, {bsz, seq_len, hidden_dim}, options); + + auto query_cont = workspace + 8 * buf_size; + size_t offset = 16 * (hidden_dim * bsz * Context::Instance().GetMaxTokenLenght()) + + layer_id * 2 * bsz * Context::Instance().GetMaxTokenLenght() * hidden_dim; + unsigned all_tokens = soft_len; + auto kv_cache = workspace + offset + (hidden_dim / heads) * (is_prompt ? 0 : soft_len - 1); + size_t value_offset = bsz * Context::Instance().GetMaxTokenLenght() * hidden_dim; + + T* temp_buf = (T*)output.data_ptr() + at::numel(output); + launch_bias_add_transform_0213((T*)query_cont, + kv_cache, + kv_cache + value_offset, + (T*)query_key_value.data_ptr(), + nullptr, + bsz, + seq_len, + (is_prompt ? 
0 : soft_len - 1), + soft_len, + hidden_dim, + heads, + rotary_dim, + rotate_half, + rotate_every_two, + Context::Instance().GetCurrentStream(), + 3, + Context::Instance().GetMaxTokenLenght()); + if (rotary_dim > 0 && rotate_half) + launch_apply_rotary_pos_emb(query_cont, + kv_cache, + k, + seq_len, + rotary_dim, + (is_prompt ? 0 : soft_len - 1), + heads, + bsz, + rotate_half, + rotate_every_two, + Context::Instance().GetCurrentStream(), + Context::Instance().GetMaxTokenLenght()); + + attention_unfused(workspace + offset, + (T*)query_cont, + attn_mask, + workspace + offset + value_offset, + temp_buf, + bsz, + k, + seq_len, + all_tokens, + heads, + norm_factor, + (triangular && is_prompt), + is_prompt, + local_attention, + window_size, + alibi, + layer_id); + launch_transform4d_0213((T*)output.data_ptr(), + temp_buf, + bsz, + heads, + seq_len, + output.size(2), + Context::Instance().GetCurrentStream(false), + 1); + + if (layer_id == num_layers - 1) Context::Instance().advance_tokens(); + auto prev_key = torch::from_blob(workspace + offset, {bsz, heads, all_tokens, k}, options); + auto prev_value = + torch::from_blob(workspace + offset + value_offset, {bsz, heads, all_tokens, k}, options); + return {output, prev_key, prev_value}; +} + template at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias) { @@ -239,6 +548,73 @@ at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias) return input_cont; } +at::Tensor ds_bias_geglu(at::Tensor& activation, at::Tensor& bias) +{ + /* + Used in FF of Stable diffusion + */ + + const int batch_size = activation.size(0); + const int seq_len = activation.size(1); + const int channels = activation.size(2); + + const int rows = batch_size * seq_len; + // Dimensionality is cut in half + const int out_channels = channels / 2; + + auto output = at::empty({batch_size, seq_len, out_channels}, activation.options()); + + if (activation.options().dtype() == torch::kFloat32) { + launch_fused_bias_geglu((float*)output.data_ptr(), + 
(const float*)activation.data_ptr(), + (const float*)bias.data_ptr(), + rows, + channels, + Context::Instance().GetCurrentStream()); + } else { + launch_fused_bias_geglu((__half*)output.data_ptr(), + (const __half*)activation.data_ptr(), + (const __half*)bias.data_ptr(), + rows, + channels, + Context::Instance().GetCurrentStream()); + } + + return output; +} + +template +at::Tensor ds_bias_relu(at::Tensor& input, at::Tensor& bias) +{ + auto input_cont = input.contiguous(); + + int bsz = input_cont.size(0) * input_cont.size(1); + int intermediate_size = input_cont.size(2); + + launch_bias_relu((T*)input_cont.data_ptr(), + (T*)bias.data_ptr(), + intermediate_size, + bsz, + Context::Instance().GetCurrentStream()); + return input_cont; +} + +template +at::Tensor ds_bias_add(at::Tensor& input, at::Tensor& bias) +{ + auto input_cont = input.contiguous(); + + int bsz = input_cont.size(0) * input_cont.size(1); + int hidden_size = input_cont.size(2); + + launch_bias_add((T*)input_cont.data_ptr(), + (T*)bias.data_ptr(), + hidden_size, + bsz, + Context::Instance().GetCurrentStream()); + return input_cont; +} + template at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at::Tensor& bias) { @@ -256,85 +632,260 @@ at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at::Tensor& return input_cont; } +at::Tensor ds_layer_norm(at::Tensor& input, at::Tensor& gamma, at::Tensor& beta, float epsilon) +{ + const int rows = input.size(0) * input.size(1); + const int elems_per_row = input.size(2); + auto output = at::empty_like(input); + + if (input.options().dtype() == torch::kFloat16) { + launch_fused_ln((__half*)output.data_ptr(), + (const __half*)input.data_ptr(), + (const __half*)gamma.data_ptr(), + (const __half*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } else { + launch_fused_ln((float*)output.data_ptr(), + (const float*)input.data_ptr(), + (const float*)gamma.data_ptr(), + (const 
float*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } + + return output; +} + template -at::Tensor ds_layernorm(at::Tensor& input_cont, at::Tensor& gamma, at::Tensor& betta, float epsilon) +void ds_layer_norm_internal(T* workspace, + at::Tensor& input, + at::Tensor& gamma, + at::Tensor& beta, + float epsilon) { - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - launch_layer_norm((T*)inp_norm.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)gamma.data_ptr(), - (T*)betta.data_ptr(), - epsilon, - bsz, - input_cont.size(2), - Context::Instance().GetCurrentStream()); - return inp_norm; + int bsz = input.size(0) * input.size(1); + launch_fused_ln(workspace, + (const T*)input.data_ptr(), + (const T*)gamma.data_ptr(), + (const T*)beta.data_ptr(), + epsilon, + bsz, + input.size(2), + Context::Instance().GetCurrentStream()); +} + +/* Currently only used in unit testing */ +at::Tensor ds_layer_norm_residual(at::Tensor& input, + at::Tensor& bias, + at::Tensor& residual, + at::Tensor& gamma, + at::Tensor& beta, + float epsilon) +{ + const int rows = input.size(0) * input.size(1); + const int elems_per_row = input.size(2); + auto output = at::empty_like(input); + + if (input.options().dtype() == torch::kFloat16) { + launch_fused_residual_ln((__half*)output.data_ptr(), + (const __half*)input.data_ptr(), + (const __half*)residual.data_ptr(), + (const __half*)bias.data_ptr(), + (const __half*)gamma.data_ptr(), + (const __half*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } else { + launch_fused_residual_ln((float*)output.data_ptr(), + (const float*)input.data_ptr(), + (const float*)residual.data_ptr(), + (const float*)bias.data_ptr(), + (const float*)gamma.data_ptr(), + (const float*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } + + return output; +} + +/* Currently only used 
in unit testing */ +std::vector ds_layer_norm_residual_store_pre_ln_res(at::Tensor& input, + at::Tensor& bias, + at::Tensor& residual, + at::Tensor& gamma, + at::Tensor& beta, + float epsilon) +{ + const int rows = input.size(0) * input.size(1); + const int elems_per_row = input.size(2); + auto norm_output = at::empty_like(input); + auto res_output = at::empty_like(input); + + if (input.options().dtype() == torch::kFloat16) { + launch_fused_residual_ln_store_pre_ln_res((__half*)norm_output.data_ptr(), + (__half*)res_output.data_ptr(), + (const __half*)input.data_ptr(), + (const __half*)residual.data_ptr(), + (const __half*)bias.data_ptr(), + (const __half*)gamma.data_ptr(), + (const __half*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } else { + launch_fused_residual_ln_store_pre_ln_res((float*)norm_output.data_ptr(), + (float*)res_output.data_ptr(), + (const float*)input.data_ptr(), + (const float*)residual.data_ptr(), + (const float*)bias.data_ptr(), + (const float*)gamma.data_ptr(), + (const float*)beta.data_ptr(), + epsilon, + rows, + elems_per_row, + Context::Instance().GetCurrentStream()); + } + + return {norm_output, res_output}; } template -at::Tensor qkv_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) +void quantized_gemm(void* output, + T* input, + at::Tensor& weight, + at::Tensor& qscale, + int groups, + int bsz, + int hidden_size) { - auto inp_norm = ds_layernorm(input, gamma, beta, epsilon); + // T* weight16 = (T*)Context::Instance().GetWorkSpace() + 12 * hidden_size * bsz; - // hipEventRecord(Context::Instance().GetCompEvent(1), Context::Instance().GetCurrentStream()); + auto options = at::TensorOptions() + .dtype(at::kHalf) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + auto tmp = torch::empty(weight.sizes(), options); + T* weight16 = 
(T*)tmp.data_ptr(); + launch_dequantize(weight16, + (int8_t*)weight.data_ptr(), + (float*)qscale.data_ptr(), + weight.size(0), + weight.size(1), + groups, + Context::Instance().GetCurrentStream()); float alpha = (T)1.0; float gemm_beta = (T)0.0; - int bsz = input.size(0) * input.size(1); - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); cublas_gemm_ex(Context::Instance().GetCublasHandle(), + rocblas_operation_transpose, rocblas_operation_none, - rocblas_operation_none, - weight.size(1), + weight.size(0), bsz, - input.size(2), + weight.size(1), &alpha, &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), + weight16, + (T*)input, + (T*)output, #ifdef __HIP_PLATFORM_HCC__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif +} + +template +at::Tensor qkv_unfused_cublas(at::Tensor& output, + at::Tensor& input, + at::Tensor& weight, + at::Tensor& q_scale, + at::Tensor& bias, + at::Tensor& gamma, + at::Tensor& beta, + const float epsilon, + bool add_bias, + bool q_int8) +{ + int bsz = input.size(0) * input.size(1); + T* workspace = (T*)Context::Instance().GetWorkSpace(); + workspace += (3 * bsz * input.size(2)); + ds_layer_norm_internal(workspace, input, gamma, beta, epsilon); + + if (q_int8) { + quantized_gemm( + output.data_ptr(), workspace, weight, q_scale, q_scale.size(0), bsz, input.size(2)); + } else { + float alpha = (T)1.0; + float gemm_beta = (T)0.0; + + rocblas_set_stream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream()); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + rocblas_operation_none, + rocblas_operation_none, + weight.size(1), + bsz, + input.size(2), + &alpha, + &gemm_beta, + (T*)weight.data_ptr(), + workspace, + (T*)output.data_ptr(), +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo_standard); +#else + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif + } if (add_bias) launch_bias_add((T*)output.data_ptr(), 
(T*)bias.data_ptr(), - weight.size(1), + q_int8 ? weight.size(0) : weight.size(1), bsz, Context::Instance().GetCurrentStream()); - return inp_norm; + return torch::from_blob(workspace, input.sizes(), input.options()); } template std::vector ds_qkv_gemm(at::Tensor& input, at::Tensor& weight, + at::Tensor& q_scale, at::Tensor& bias, at::Tensor& gamma, at::Tensor& beta, const float epsilon, - bool add_bias) + bool add_bias, + unsigned num_layers, + bool external_cache, + unsigned mp_size, + unsigned rank, + bool q_int8) { - auto input_cont = input.contiguous(); + int bsz = input.size(0) * input.size(1); + T* workspace = (T*)Context::Instance().GetWorkSpace(); + int out_size = q_int8 ? weight.size(0) : weight.size(1); + auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) + .dtype(input.options().dtype()) .layout(at::kStrided) .device(at::kCUDA) .requires_grad(false); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = - qkv_unfused_cublas(output, input_cont, weight, bias, gamma, beta, epsilon, add_bias); + auto output = at::from_blob(workspace, {input.size(0), input.size(1), out_size}, options); + auto inp_norm = qkv_unfused_cublas( + output, input, weight, q_scale, bias, gamma, beta, epsilon, add_bias, q_int8); return {output, inp_norm}; } @@ -358,20 +909,18 @@ void quantized_gemm(at::Tensor& output, launch_dequantize((T*)weight16.data_ptr(), (int8_t*)weight.data_ptr(), (float*)qscale.data_ptr(), - weight.size(1), weight.size(0), + weight.size(1), groups, merge_count, Context::Instance().GetCurrentStream()); - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - float alpha = (T)1.0; float gemm_beta = (T)0.0; cublas_gemm_ex(Context::Instance().GetCublasHandle(), + rocblas_operation_transpose, rocblas_operation_none, - rocblas_operation_none, - weight.size(1), + weight.size(0), bsz, 
input.size(2), &alpha, @@ -407,7 +956,7 @@ at::Tensor ds_qkv_gemm_int8(at::Tensor& input, auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - auto inp_norm = ds_layernorm(input_cont, gamma, beta, epsilon); + auto inp_norm = ds_layer_norm(input_cont, gamma, beta, epsilon); quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); if (add_bias) @@ -421,7 +970,12 @@ at::Tensor ds_qkv_gemm_int8(at::Tensor& input, } template -at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bias) +at::Tensor ds_linear_layer(at::Tensor& input, + at::Tensor& weight, + at::Tensor& bias, + bool add_bias, + bool do_flash_attn, + int num_heads) { auto input_cont = input.contiguous(); auto options = at::TensorOptions() @@ -430,8 +984,10 @@ at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bi .device(at::kCUDA) .requires_grad(false); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); + int head_size = input_cont.size(2) / num_heads; + int bsz = input.size(0) * input.size(1); + T* workspace = (T*)Context::Instance().GetWorkSpace(); + auto output = at::from_blob(workspace, {input.size(0), input.size(1), weight.size(1)}, options); float alpha = (T)1.0; float gemm_beta = (T)0.0; @@ -453,16 +1009,172 @@ at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bi #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif + if (add_bias) + launch_bias_add((T*)output.data_ptr(), + (T*)bias.data_ptr(), + weight.size(1), + bsz, + Context::Instance().GetCurrentStream()); + bool add_padding = (head_size % 32 != 0 && head_size < 64) || (head_size % 64 != 0); + if (do_flash_attn) { + if (add_padding) { + int padded_head_size = head_size < 32 ? 32 : (head_size < 64 ? 
64 : 128); + auto padded_output = workspace + output.numel(); + auto final_output = + padded_output + (input.size(0) * input.size(1) * 3 * num_heads * padded_head_size); + pad_data(padded_output, + workspace, + 3 * bsz * num_heads, + head_size, + padded_head_size, + Context::Instance().GetCurrentStream()); - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); + launch_bias_add_transform_0213( + final_output, + final_output + (input.size(0) * input.size(1) * num_heads * padded_head_size), + final_output + (input.size(0) * input.size(1) * 2 * num_heads * padded_head_size), + padded_output, + nullptr, + input.size(0), + input.size(1), + 0, + input.size(1), + (num_heads * padded_head_size), + num_heads, + -1, + false, + false, + Context::Instance().GetCurrentStream(), + 3, + input.size(1)); + return at::from_blob(final_output, + {3, input.size(0), num_heads, input.size(1), padded_head_size}, + options); + // return at::from_blob(padded_output, {input.size(0) * input.size(1), 3, num_heads, + // padded_head_size}, options); + } else { + auto final_output = workspace + output.numel(); + launch_bias_add_transform_0213( + final_output, + final_output + (input.size(0) * input.size(1) * input_cont.size(2)), + final_output + (input.size(0) * input.size(1) * 2 * input_cont.size(2)), + workspace, + nullptr, + input.size(0), + input.size(1), + 0, + input.size(1), + input_cont.size(2), + num_heads, + -1, + false, + false, + Context::Instance().GetCurrentStream(), + 3, + input.size(1)); + return at::from_blob( + final_output, {3, input.size(0), num_heads, input.size(1), head_size}, options); + // return at::from_blob(workspace, {input.size(0) * input.size(1), 3, num_heads, + // head_size}, options); + } + + } else + return output; +} - return output; +template +std::vector add_padding(at::Tensor& query, at::Tensor& key, at::Tensor& value) +{ + int head_size = query.size(3); + int padded_head_size = 
head_size < 32 ? 32 : (head_size < 64 ? 64 : 128); + T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* key_pad_ptr = workspace + padded_head_size * query.size(0) * query.size(1) * query.size(2); + T* value_pad_ptr = key_pad_ptr + padded_head_size * query.size(0) * query.size(1) * 128; + pad_head_seq(workspace, + (T*)query.data_ptr(), + query.size(0) * query.size(1), + query.size(2), + query.size(2), + head_size, + padded_head_size, + Context::Instance().GetCurrentStream()); + pad_head_seq(key_pad_ptr, + (T*)key.data_ptr(), + query.size(0) * query.size(1), + key.size(2), + 128, + head_size, + padded_head_size, + Context::Instance().GetCurrentStream()); + pad_head_seq(value_pad_ptr, + (T*)value.data_ptr(), + query.size(0) * query.size(1), + key.size(2), + 128, + head_size, + padded_head_size, + Context::Instance().GetCurrentStream()); + return { + at::from_blob(workspace, + {query.size(0), query.size(1), query.size(2), padded_head_size}, + query.options()), + at::from_blob( + key_pad_ptr, {query.size(0), query.size(1), 128, padded_head_size}, query.options()), + at::from_blob( + value_pad_ptr, {query.size(0), query.size(1), 128, padded_head_size}, query.options())}; } +template +std::vector padd_add_transform(at::Tensor& query, + at::Tensor& key, + at::Tensor& value, + int heads, + bool add_padding) +{ + int head_size = query.size(2) / heads; + int key_value_length = add_padding ? 128 : key.size(1); + int padded_head_size = add_padding ? (head_size < 32 ? 32 : (head_size < 64 ? 
64 : 128)) + : head_size; + T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* key_pad_ptr = workspace + padded_head_size * query.size(0) * heads * query.size(1); + T* value_pad_ptr = key_pad_ptr + padded_head_size * query.size(0) * heads * key_value_length; + launch_pad_add_transform_0213(workspace, + (T*)query.data_ptr(), + query.size(0), + query.size(2), + query.size(1), + query.size(1), + heads, + padded_head_size, + Context::Instance().GetCurrentStream()); + launch_pad_add_transform_0213(key_pad_ptr, + (T*)key.data_ptr(), + key.size(0), + key.size(2), + key.size(1), + key_value_length, + heads, + padded_head_size, + Context::Instance().GetCurrentStream()); + launch_pad_add_transform_0213(value_pad_ptr, + (T*)value.data_ptr(), + value.size(0), + value.size(2), + value.size(1), + key_value_length, + heads, + padded_head_size, + Context::Instance().GetCurrentStream()); + return { + at::from_blob( + workspace, {query.size(0), heads, query.size(1), padded_head_size}, query.options()), + at::from_blob(key_pad_ptr, + {query.size(0), heads, key_value_length, padded_head_size}, + query.options()), + at::from_blob(value_pad_ptr, + {query.size(0), heads, key_value_length, padded_head_size}, + query.options())}; +} template at::Tensor ds_linear_layer_int8(at::Tensor& input, at::Tensor& weight, @@ -490,37 +1202,52 @@ at::Tensor ds_linear_layer_int8(at::Tensor& input, } template -at::Tensor ds_vector_matmul(at::Tensor& input, at::Tensor& weight, bool async_op) +at::Tensor ds_vector_matmul(at::Tensor& input, + at::Tensor& weight, + bool async_op, + at::Tensor& q_scale, + bool q_int8) { - auto input_cont = input.contiguous(); auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) + .dtype(input.options().dtype()) .layout(at::kStrided) .device(at::kCUDA) .requires_grad(false); + int out_size = q_int8 ? 
weight.size(0) : weight.size(1); + int bsz = input.size(0) * input.size(1); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream(async_op)); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), + T* workspace = (T*)Context::Instance().GetWorkSpace(); + auto output = at::from_blob(workspace, {input.size(0), input.size(1), out_size}, options); + if (q_int8) { + quantized_gemm(output.data_ptr(), + (T*)input.data_ptr(), + weight, + q_scale, + q_scale.size(0), + bsz, + input.size(2)); + } else { + float alpha = (T)1.0; + float gemm_beta = (T)0.0; + rocblas_set_stream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream(async_op)); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + rocblas_operation_none, + rocblas_operation_none, + weight.size(1), + bsz, + input.size(2), + &alpha, + &gemm_beta, + (T*)weight.data_ptr(), + (T*)input.data_ptr(), + (T*)output.data_ptr(), #ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); + rocblas_gemm_algo_standard); #else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif + } return output; } @@ -545,95 +1272,163 @@ at::Tensor ds_vector_matmul_int8(at::Tensor& input, } template -void mlp_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) +at::Tensor mlp_unfused_cublas(at::Tensor& output, + at::Tensor& input, + at::Tensor& residual, + 
at::Tensor& input_bias, + at::Tensor& weight, + at::Tensor& weight1, + at::Tensor& bias, + at::Tensor& gamma, + at::Tensor& beta, + const float epsilon, + bool preLayerNorm, + bool mlp_after_attn, + at::Tensor& q_scale, + at::Tensor& q_scale1, + bool q_int8, + ActivationFuncType act_func_type) { int bsz = input.size(0) * input.size(1); - auto inp_norm = at::empty_like(input); - - launch_residual_layer_norm((T*)inp_norm.data_ptr(), - (T*)nullptr, - (T*)input.data_ptr(), - (T*)residual.data_ptr(), - (T*)input_bias.data_ptr(), - (T*)gamma.data_ptr(), - (T*)beta.data_ptr(), - epsilon, - bsz, - input.size(2), - preLayerNorm, - mlp_after_attn, - Context::Instance().GetCurrentStream()); + T* inp_norm = + (T*)Context::Instance().GetWorkSpace() + torch::numel(input) + torch::numel(output); + T* intermediate = inp_norm + torch::numel(input); + + if (mlp_after_attn) { + launch_fused_residual_ln((T*)inp_norm, + (const T*)input.data_ptr(), + (const T*)residual.data_ptr(), + (const T*)input_bias.data_ptr(), + (const T*)gamma.data_ptr(), + (const T*)beta.data_ptr(), + epsilon, + bsz, + input.size(2), + Context::Instance().GetCurrentStream()); + } else { + ds_layer_norm_internal(inp_norm, input, gamma, beta, epsilon); + } + if (q_int8) { + quantized_gemm( + intermediate, inp_norm, weight, q_scale, q_scale.size(0), bsz, input.size(2)); + } else { + float alpha = (T)1.0; + float gemm_beta = (T)0.0; + rocblas_set_stream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream()); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + rocblas_operation_none, + rocblas_operation_none, + weight.size(1), + bsz, + input.size(2), + &alpha, + &gemm_beta, + (T*)weight.data_ptr(), + inp_norm, + intermediate, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo_standard); +#else + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif + } + if (act_func_type == ActivationFuncType::GELU) { + launch_bias_gelu(intermediate, + (T*)bias.data_ptr(), + q_int8 ? 
weight.size(0) : weight.size(1), + bsz, + Context::Instance().GetCurrentStream()); + } else if (act_func_type == ActivationFuncType::ReLU) { + launch_bias_relu(intermediate, + (T*)bias.data_ptr(), + q_int8 ? weight.size(0) : weight.size(1), + bsz, + Context::Instance().GetCurrentStream()); + } - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), + if (q_int8) { + quantized_gemm(output.data_ptr(), + intermediate, + weight1, + q_scale1, + q_scale1.size(0), + bsz, + input.size(2)); + } else { + float alpha = (T)1.0; + float gemm_beta = (T)0.0; + rocblas_set_stream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream()); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + rocblas_operation_none, + rocblas_operation_none, + weight1.size(1), + bsz, + weight1.size(0), + &alpha, + &gemm_beta, + (T*)weight1.data_ptr(), + intermediate, + (T*)output.data_ptr(), #ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); + rocblas_gemm_algo_standard); #else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); + } + + return torch::from_blob(inp_norm, input.sizes(), input.options()); } + template -at::Tensor ds_mlp_gemm(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) +std::vector ds_mlp_gemm(at::Tensor& input, + at::Tensor& residual, + at::Tensor& input_bias, + at::Tensor& 
weight_interm, + at::Tensor& weight_out, + at::Tensor& bias, + at::Tensor& gamma, + at::Tensor& beta, + const float epsilon, + bool preLayerNorm, + bool mlp_after_attn, + at::Tensor& q_scale, + at::Tensor& q_scale1, + bool q_int8, + int activation_type) { - auto input_cont = input.contiguous(); auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) + .dtype(input.options().dtype()) .layout(at::kStrided) .device(at::kCUDA) .requires_grad(false); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - mlp_unfused_cublas(output, - mlp_after_attn ? input : residual, - residual, - input_bias, - weight, - bias, - gamma, - beta, - epsilon, - preLayerNorm, - mlp_after_attn); + int out_size = q_int8 ? weight_out.size(0) : weight_out.size(1); + auto output = at::from_blob((T*)Context::Instance().GetWorkSpace() + torch::numel(input), + {input.size(0), input.size(1), out_size}, + options); + int bsz = input.size(0) * input.size(1); - return output; + auto act_func_type = static_cast(activation_type); + auto res_add = mlp_unfused_cublas(output, + mlp_after_attn ? input : residual, + residual, + input_bias, + weight_interm, + weight_out, + bias, + gamma, + beta, + epsilon, + preLayerNorm, + mlp_after_attn, + q_scale, + q_scale1, + q_int8, + act_func_type); + + return {output, res_add}; } template @@ -662,20 +1457,6 @@ std::vector ds_mlp_gemm_int8(at::Tensor& input, auto inp_norm = at::empty_like(input_cont); auto residual_add = (preLayerNorm ? 
at::empty_like(input_cont) : inp_norm); - // computing the blocking across K dimension - // launch_residual_layer_norm((T*)inp_norm.data_ptr(), - // (T*)residual_add.data_ptr(), - // (T*)input_cont.data_ptr(), - // (T*)residual.data_ptr(), - // (T*)input_bias.data_ptr(), - // (T*)gamma.data_ptr(), - // (T*)beta.data_ptr(), - // epsilon, - // bsz, - // input_cont.size(2), - // preLayerNorm, - // Context::Instance().GetCurrentStream()); - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); launch_bias_gelu((T*)output.data_ptr(), (T*)bias.data_ptr(), @@ -689,122 +1470,136 @@ std::vector ds_mlp_gemm_int8(at::Tensor& input, template at::Tensor fused_gemm_gelu(at::Tensor& input, at::Tensor& weight, + at::Tensor& weight_scale, at::Tensor& bias, at::Tensor& weight_out, + at::Tensor& weight_out_scale, const float epsilon, bool preLayerNorm, + bool q_int8, bool async_op) { - auto input_cont = input.contiguous(); auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) + .dtype(input.options().dtype()) .layout(at::kStrided) .device(at::kCUDA) .requires_grad(false); - auto intermediate = - at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight_out.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); + int intm_dim = q_int8 ? 
weight.size(0) : weight.size(1); + + // auto output = at::from_blob((T*)Context::Instance().GetWorkSpace() + torch::numel(input), + // {input.size(0), input.size(1), out_size}, + // options); + // T* intermediate = (T*)input.data_ptr() + torch::numel(input); + auto intermediate = at::empty({input.size(0), input.size(1), intm_dim}, options); + + int bsz = input.size(0) * input.size(1); + float alpha = (T)1.0; float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)intermediate.data_ptr(), + if (q_int8) { + quantized_gemm(intermediate.data_ptr(), + (T*)input.data_ptr(), + weight, + weight_scale, + weight_scale.size(0), + bsz, + input.size(2)); + } else { + rocblas_set_stream(Context::Instance().GetCublasHandle(), + Context::Instance().GetCurrentStream()); + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + rocblas_operation_none, + rocblas_operation_none, + intm_dim, + bsz, + input.size(2), + &alpha, + &gemm_beta, + (T*)weight.data_ptr(), + (T*)input.data_ptr(), + (T*)intermediate.data_ptr(), #ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); + rocblas_gemm_algo_standard); #else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif + } launch_bias_gelu((T*)intermediate.data_ptr(), (T*)bias.data_ptr(), - weight.size(1), + intm_dim, bsz, Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight_out.size(1), - bsz, - intermediate.size(2), - &alpha, - &gemm_beta, - (T*)weight_out.data_ptr(), - (T*)intermediate.data_ptr(), - (T*)output.data_ptr(), + int out_size = q_int8 ? 
weight_out.size(0) : weight_out.size(1); + auto output = at::empty({input.size(0), input.size(1), out_size}, options); + if (q_int8) { + quantized_gemm(output.data_ptr(), + (T*)intermediate.data_ptr(), + weight_out, + weight_out_scale, + weight_out_scale.size(0), + bsz, + input.size(2)); + } else { + cublas_gemm_ex(Context::Instance().GetCublasHandle(), + rocblas_operation_none, + rocblas_operation_none, + out_size, + bsz, + intm_dim, + &alpha, + &gemm_beta, + (T*)weight_out.data_ptr(), + (T*)intermediate.data_ptr(), + (T*)output.data_ptr(), #ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); + rocblas_gemm_algo_standard); #else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif + } // hipEventRecord(Context::Instance().GetCompEvent(2), // Context::Instance().GetCurrentStream(true)); return output; } -void residual_add_bias(at::Tensor& output, - at::Tensor& input, - at::Tensor& attention_output, - at::Tensor& output_b, - at::Tensor& attention_b, - int mp_size, - bool mlp_after_attn) +template +at::Tensor& residual_add_bias(at::Tensor& hidden_state, + at::Tensor& residual, + const at::Tensor& attention_output, + const at::Tensor& attention_bias, + const at::Tensor& final_bias, + const int mp_size, + const bool mlp_after_attn, + const bool add_bias, + const bool preln) { - int bsz = input.size(0) * input.size(1); - int hidden_size = input.size(2); - // hipStreamWaitEvent( - // Context::Instance().GetCurrentStream(), Context::Instance().GetCompEvent(2), 0); - if (input.scalar_type() == at::kFloat) - if (mlp_after_attn) - launch_bias_residual((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - launch_gptj_residual_add((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - 
(float*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); - else if (mlp_after_attn) - launch_bias_residual((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), + int bsz = residual.size(0) * residual.size(1); + int hidden_size = residual.size(2); + if (mlp_after_attn) + launch_bias_residual(static_cast(residual.data_ptr()), + static_cast(hidden_state.data_ptr()), + static_cast(attention_output.data_ptr()), + static_cast(final_bias.data_ptr()), + static_cast(attention_bias.data_ptr()), bsz, hidden_size, mp_size, + preln, Context::Instance().GetCurrentStream()); else - launch_gptj_residual_add<__half>((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); + launch_gptj_residual_add( + static_cast(residual.data_ptr()), + static_cast(hidden_state.data_ptr()), + static_cast(attention_output.data_ptr()), + static_cast(final_bias.data_ptr()), + static_cast((add_bias ? 
attention_bias.data_ptr() : nullptr)), + hidden_size, + bsz, + mp_size, + Context::Instance().GetCurrentStream()); + return residual; } std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, @@ -833,7 +1628,8 @@ std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, bsz, rotate_half, rotate_every_two, - Context::Instance().GetCurrentStream()); + Context::Instance().GetCurrentStream(), + Context::Instance().GetMaxTokenLenght()); else launch_apply_rotary_pos_emb<__half>((__half*)query_cont.data_ptr(), (__half*)key_cont.data_ptr(), @@ -845,7 +1641,8 @@ std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, bsz, rotate_half, rotate_every_two, - Context::Instance().GetCurrentStream()); + Context::Instance().GetCurrentStream(), + Context::Instance().GetMaxTokenLenght()); return {query_cont, key_cont}; } @@ -905,22 +1702,34 @@ at::Tensor moe_res_matmul(at::Tensor& moe_res, at::Tensor& coef, at::Tensor& out PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("softmax_fp32", &ds_softmax, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def("softmax_fp16", &ds_softmax<__half>, "DeepSpeed SoftMax with fp32 (CUDA)"); + m.def("softmax_fp16", &ds_softmax<__half>, "DeepSpeed SoftMax with fp16 (CUDA)"); m.def( "softmax_context_fp32", &ds_softmax_context, "DeepSpeed attention with fp32 (CUDA)"); m.def("softmax_context_fp16", &ds_softmax_context<__half>, - "DeepSpeed attention with fp32 (CUDA)"); + "DeepSpeed attention with fp16 (CUDA)"); + m.def("softmax_context_int8", + &ds_softmax_context1<__half>, + "DeepSpeed attention with int8 (CUDA)"); m.def("bias_gelu_fp32", &ds_bias_gelu, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp32 (CUDA)"); + m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp16 (CUDA)"); + m.def("bias_geglu", &ds_bias_geglu, "DeepSpeed Bias GEGLU (CUDA)"); + m.def("bias_add_fp32", &ds_bias_add, "DeepSpeed Bias Add with fp32 (CUDA)"); + m.def("bias_add_fp16", &ds_bias_add<__half>, 
"DeepSpeed Gelu with fp16 (CUDA)"); + m.def("bias_relu_fp32", &ds_bias_relu, "DeepSpeed ReLU with fp32 (CUDA)"); + m.def("bias_relu_fp16", &ds_bias_relu<__half>, "DeepSpeed ReLU with fp16 (CUDA)"); m.def("bias_residual_fp32", &ds_bias_residual, "DeepSpeed residual-bias add with fp32 (CUDA)"); m.def("bias_residual_fp16", &ds_bias_residual<__half>, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("layer_norm_fp32", &ds_layernorm, "DeepSpeed layer-norm with fp32 (CUDA)"); - m.def("layer_norm_fp16", &ds_layernorm<__half>, "DeepSpeed layer-norm with fp16 (CUDA)"); + "DeepSpeed residual-bias add with fp16 (CUDA)"); + m.def("layer_norm", &ds_layer_norm, "DeepSpeed layer norm (CUDA)"); + m.def( + "_layer_norm_residual", &ds_layer_norm_residual, "DeepSpeed layer norm + residual (CUDA)"); + m.def("layer_norm_residual_store_pre_ln_res", + &ds_layer_norm_residual_store_pre_ln_res, + "DeepSpeed layer norm + store pre Layernorm residual (CUDA)"); m.def("qkv_gemm_fp32", &ds_qkv_gemm, "DeepSpeed qkv gemm with fp32 (CUDA)"); m.def("qkv_gemm_fp16", &ds_qkv_gemm<__half>, "DeepSpeed qkv gemm with fp16 (CUDA)"); m.def("qkv_gemm_int8", &ds_qkv_gemm_int8<__half>, "DeepSpeed qkv gemm with int8 (CUDA)"); @@ -939,7 +1748,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "DeepSpeed linear_layer with int8 (CUDA)"); m.def("fused_gemm_gelu_fp32", &fused_gemm_gelu, "DeepSpeed mlp with fp32 (CUDA)"); m.def("fused_gemm_gelu_fp16", &fused_gemm_gelu<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("residual_add", &residual_add_bias, "DeepSpeed mlp with fp16 (CUDA)"); + m.def("residual_add_bias_fp32", + &residual_add_bias, + "DeepSpeed residual add with fp32 (CUDA)"); + m.def("residual_add_bias_fp16", + &residual_add_bias<__half>, + "DeepSpeed residual add with fp16 (CUDA)"); m.def("apply_rotary_pos_emb", &apply_rotary_pos_emb, "DeepSpeed mlp with fp16 (CUDA)"); m.def("einsum_sec_sm_ecm_fp32", &einsum_sec_sm_ecm, @@ -949,4 +1763,19 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 
&einsum_sec_sm_ecm<__half>, "DeepSpeed vector-MM with fp16 (CUDA)"); m.def("moe_res_matmul", &moe_res_matmul, "DeepSpeed moe residual matmul (CUDA)"); + m.def("add_padding_fp32", &add_padding, "DeepSpeed residual add with fp32 (CUDA)"); + m.def("add_padding_fp16", &add_padding<__half>, "DeepSpeed residual add with fp16 (CUDA)"); + m.def("pad_transform_fp32", + &padd_add_transform, + "DeepSpeed residual add with fp32 (CUDA)"); + m.def("pad_transform_fp16", + &padd_add_transform<__half>, + "DeepSpeed residual add with fp16 (CUDA)"); + m.def("allocate_workspace_fp32", + &allocate_workspace, + "DeepSpeed memory allocation for GPT inference with fp32 (CUDA)"); + m.def("allocate_workspace_fp16", + &allocate_workspace<__half>, + "DeepSpeed memory allocation for GPT inference with fp16 (CUDA)"); + m.def("reset_cache", &reset_cache, "Reset Cache for generation tasks"); } diff --git a/csrc/transformer/inference/csrc/relu.cu b/csrc/transformer/inference/csrc/relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..87e169a9194ff714fa15ecbef165944d95eca1e9 --- /dev/null +++ b/csrc/transformer/inference/csrc/relu.cu @@ -0,0 +1,63 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "conversion_utils.h" +#include "inference_cuda_layers.h" +#include "memory_access_utils.h" + +namespace cg = cooperative_groups; +#define MAX_CAP 4 +#define MAX_SEQ 2048 + +inline __device__ float relu(const float x) { return x < 0 ? 
0 : x; } + +/* +In-place relu(biasAdd(x)) for channels last +*/ +template +__global__ void fused_bias_relu(T* input, const T* bias, int total_count, int intermediate_size) +{ + // Input restriction: intermediate_size % vals_per_access == 0 + constexpr int granularity = 16; + constexpr int values_per_access = granularity / sizeof(T); + const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * values_per_access; + + if (offset < total_count) { + T data[values_per_access]; + T data_bias[values_per_access]; + mem_access::load_global(data, input + offset); + mem_access::load_global(data_bias, bias + (offset % intermediate_size)); + +#pragma unroll + for (int i = 0; i < values_per_access; i++) { + float data_f = conversion::to(data[i]); + float bias_f = conversion::to(data_bias[i]); + data[i] = conversion::to(relu(data_f + bias_f)); + } + + mem_access::store_global(input + offset, data); + } +} + +template +void launch_bias_relu(T* input, + const T* bias, + int intermediate_size, + int batch_size, + cudaStream_t stream) +{ + constexpr int threads = 1024; + constexpr int granularity = 16; + + const int total_count = batch_size * intermediate_size; + const int elems_per_block = threads * (granularity / sizeof(T)); + dim3 block_dims(threads); + dim3 grid_dims((total_count + elems_per_block - 1) / elems_per_block); + + fused_bias_relu<<>>( + input, bias, total_count, intermediate_size); +} + +template void launch_bias_relu(float*, const float*, int, int, cudaStream_t); +template void launch_bias_relu<__half>(__half*, const __half*, int, int, cudaStream_t); diff --git a/csrc/transformer/inference/csrc/relu.hip b/csrc/transformer/inference/csrc/relu.hip new file mode 100644 index 0000000000000000000000000000000000000000..a34d3fbe9357d941c9ec36b0c4884d7741d177a1 --- /dev/null +++ b/csrc/transformer/inference/csrc/relu.hip @@ -0,0 +1,65 @@ +// !!! This is a file automatically generated by hipify!!! 
+#include "hip/hip_runtime.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#include "conversion_utils.h" +#include "inference_cuda_layers.h" +#include "memory_access_utils.h" + +namespace cg = cooperative_groups; +#define MAX_CAP 4 +#define MAX_SEQ 2048 + +inline __device__ float relu(const float x) { return x < 0 ? 0 : x; } + +/* +In-place relu(biasAdd(x)) for channels last +*/ +template +__global__ void fused_bias_relu(T* input, const T* bias, int total_count, int intermediate_size) +{ + // Input restriction: intermediate_size % vals_per_access == 0 + constexpr int granularity = 16; + constexpr int values_per_access = granularity / sizeof(T); + const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * values_per_access; + + if (offset < total_count) { + T data[values_per_access]; + T data_bias[values_per_access]; + mem_access::load_global(data, input + offset); + mem_access::load_global(data_bias, bias + (offset % intermediate_size)); + +#pragma unroll + for (int i = 0; i < values_per_access; i++) { + float data_f = conversion::to(data[i]); + float bias_f = conversion::to(data_bias[i]); + data[i] = conversion::to(relu(data_f + bias_f)); + } + + mem_access::store_global(input + offset, data); + } +} + +template +void launch_bias_relu(T* input, + const T* bias, + int intermediate_size, + int batch_size, + hipStream_t stream) +{ + constexpr int threads = 1024; + constexpr int granularity = 16; + + const int total_count = batch_size * intermediate_size; + const int elems_per_block = threads * (granularity / sizeof(T)); + dim3 block_dims(threads); + dim3 grid_dims((total_count + elems_per_block - 1) / elems_per_block); + + hipLaunchKernelGGL(( fused_bias_relu), dim3(grid_dims), dim3(block_dims), 0, stream, + input, bias, total_count, intermediate_size); +} + +template void launch_bias_relu(float*, const float*, int, int, hipStream_t); +template void launch_bias_relu<__half>(__half*, const __half*, int, int, hipStream_t); diff --git 
a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index bf3c8bc90049ddd9cf91ce4006729d02ebcdcf3e..c5f04176203b444c196bff72827850b1bb8cc6ad 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -1,5 +1,9 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + #include -#include "custom_cuda_layers.h" +#include "inference_cuda_layers.h" #ifndef __HIP_PLATFORM_HCC__ #include @@ -8,7 +12,7 @@ #include #include -#define ATTN_THREADS 1024 +#define ATTN_THREADS 256 #define MAX_REG_SIZE 8 #define minus_infinity -10000.0 @@ -28,6 +32,8 @@ namespace cg = cooperative_groups; __global__ void attn_softmax_v2(__half* vals, __half* mask, + __half* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -36,19 +42,18 @@ __global__ void attn_softmax_v2(__half* vals, int heads, int sequence_length, int num_seq, - float scale, + int head_offset, + int mask_stride, + int mp_size, int iterations, int reduceWidth) { -#ifdef HALF_PRECISION_AVAILABLE - cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); float2 low_data[MAX_REG_SIZE]; float2 high_data[MAX_REG_SIZE]; - - __half2 h_scale = __float2half2_rn(scale); + const __half zero_h = __float2half(0.f); int wid = threadIdx.x >> 5; int lane = threadIdx.x & 0x1f; @@ -60,11 +65,15 @@ __global__ void attn_softmax_v2(__half* vals, __shared__ float partialSum[MAX_WARP_NUM]; int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); + int batch_idx = iter_offset / (num_seq * heads); + int alibi_offset = batch_idx * heads * mp_size + head_offset; + int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride); if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); + alibi_offset = (alibi_offset + ((iter_offset / num_seq) % heads)) * sequence_length; + 
mask_offset = mask_offset * sequence_length; int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; @@ -76,47 +85,67 @@ __global__ void attn_softmax_v2(__half* vals, (local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1; float max_val = minus_infinity; - + // if (lane == 0) printf("%d, %d: %d \n", wid, blockIdx.x, mask_offset); for (int i = 0; i < iterations; i++) { int data_id = i * (reduceWidth << 2) + (seq_lane << 2); if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && data_id < sequence_length) { if ((sequence_length - data_id) >= 4) { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; + low_data[i].x = data_id > window_stride + ? __half2float(vals[data_id]) * layer_scale + : minus_infinity; low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && (data_id + 1) > window_stride) - ? __half2float(vals[data_id + 1]) + ? __half2float(vals[data_id + 1]) * layer_scale : minus_infinity; high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) && (data_id + 2) > window_stride) - ? __half2float(vals[data_id + 2]) + ? __half2float(vals[data_id + 2]) * layer_scale : minus_infinity; high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) && (data_id + 3) > window_stride) - ? __half2float(vals[data_id + 3]) + ? 
__half2float(vals[data_id + 3]) * layer_scale : minus_infinity; - if (mask && recompute) { + if (alibi) { + low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]); + low_data[i].y = + low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]); + high_data[i].x = + high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]); + high_data[i].y = + high_data[i].y + __half2float(alibi[data_id + alibi_offset + 3]); + } + if (mask) { low_data[i].x += __half2float(mask[data_id + mask_offset]); low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); } } else { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; + low_data[i].x = data_id > window_stride + ? __half2float(vals[data_id]) * layer_scale + : minus_infinity; low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) && (data_id + 1) > window_stride) && (data_id + 1) < sequence_length) - ? __half2float(vals[data_id + 1]) + ? __half2float(vals[data_id + 1]) * layer_scale : minus_infinity; high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) && (data_id + 2) > window_stride) && (data_id + 2) < sequence_length) - ? __half2float(vals[data_id + 2]) + ? 
__half2float(vals[data_id + 2]) * layer_scale : minus_infinity; + if (alibi) { + low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]); + if ((data_id + 1) < sequence_length) + low_data[i].y = + low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]); + if ((data_id + 2) < sequence_length) + high_data[i].x = + high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]); + } high_data[i].y = minus_infinity; - if (mask && recompute) { + if (mask) { low_data[i].x += __half2float(mask[data_id + mask_offset]); if ((data_id + 1) < sequence_length) low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); @@ -187,23 +216,26 @@ __global__ void attn_softmax_v2(__half* vals, if (data_id < sequence_length) { if ((sequence_length - data_id) >= 4) { - vals[data_id] = low_data[i].x / sum; - vals[data_id + 1] = low_data[i].y / sum; - vals[data_id + 2] = high_data[i].x / sum; - vals[data_id + 3] = high_data[i].y / sum; + vals[data_id] = __float2half(low_data[i].x / sum); + vals[data_id + 1] = __float2half(low_data[i].y / sum); + vals[data_id + 2] = __float2half(high_data[i].x / sum); + vals[data_id + 3] = __float2half(high_data[i].y / sum); } else { - vals[data_id] = low_data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = low_data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = high_data[i].x / sum; + vals[data_id] = __float2half(low_data[i].x / sum); + if ((data_id + 1) < sequence_length) + vals[data_id + 1] = __float2half(low_data[i].y / sum); + if ((data_id + 2) < sequence_length) + vals[data_id + 2] = __float2half(high_data[i].x / sum); } } } } -#endif } __global__ void attn_softmax_v2(float* vals, float* attn_mask, + float* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -212,7 +244,9 @@ __global__ void attn_softmax_v2(float* vals, int heads, int sequence_length, int num_seq, - float scale, + int head_offset, + int mask_stride, + int mp_size, int 
iterations, int reduceWidth) { @@ -234,7 +268,10 @@ __global__ void attn_softmax_v2(float* vals, if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); + int batch_idx = iter_offset / (num_seq * heads); + int alibi_offset = batch_idx * heads * mp_size + head_offset; + int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride); + mask_offset = mask_offset * sequence_length; int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; @@ -265,7 +302,7 @@ __global__ void attn_softmax_v2(float* vals, (data_id + 3) > window_stride) ? vals[data_id + 3] : minus_infinity; - if (attn_mask && recompute) { + if (attn_mask) { data[i].x += attn_mask[data_id + mask_offset]; data[i].y += attn_mask[data_id + mask_offset + 1]; data[i].z += attn_mask[data_id + mask_offset + 2]; @@ -282,7 +319,7 @@ __global__ void attn_softmax_v2(float* vals, ? (vals[data_id + 2]) : minus_infinity; data[i].w = minus_infinity; - if (attn_mask && recompute) { + if (attn_mask) { data[i].x += attn_mask[data_id + mask_offset]; if ((data_id + 1) < sequence_length) data[i].y += attn_mask[data_id + mask_offset + 1]; @@ -371,6 +408,8 @@ __global__ void attn_softmax_v2(float* vals, template void launch_attn_softmax_v2(T* vals, T* mask, + T* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -379,37 +418,46 @@ void launch_attn_softmax_v2(T* vals, int heads, int num_seq, int sequence_length, - float scale, + int head_offset, + int mask_stride, + int mp_size, cudaStream_t stream) { int total_count = batch_size * heads * num_seq; - dim3 grid_dim((total_count - 1) / (WARP_SIZE / ((sequence_length - 1) / ATTN_THREADS + 1)) + 1); + int warp_num = ATTN_THREADS / WARP_SIZE; + int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1); + reduce_width = (int)pow(2.0, floor(log2((float)(reduce_width)))) * WARP_SIZE; + dim3 grid_dim((total_count - 1) / (ATTN_THREADS / 
reduce_width) + 1); dim3 block_dim(ATTN_THREADS); - const int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1) * WARP_SIZE; const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1; if (sequence_length <= 32768) - attn_softmax_v2<<>>( - vals, - mask, - triangular, - recompute, - local_attention, - window_size, - total_count, - (triangular ? (heads * batch_size) : heads), - sequence_length, - num_seq, - scale, - iterations, - reduce_width); + attn_softmax_v2<<>>(vals, + mask, + alibi, + layer_scale, + triangular, + recompute, + local_attention, + window_size, + total_count, + heads, + sequence_length, + num_seq, + head_offset, + mask_stride, + mp_size, + iterations, + reduce_width); else throw std::runtime_error("Unsupport Seq_Length!"); } template void launch_attn_softmax_v2(float* vals, float* mask, + float* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -418,10 +466,14 @@ template void launch_attn_softmax_v2(float* vals, int heads, int num_seq, int sequence_length, - float scale, + int head_offset, + int mask_stride, + int mp_size, cudaStream_t stream); template void launch_attn_softmax_v2(__half* vals, __half* mask, + __half* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -430,5 +482,7 @@ template void launch_attn_softmax_v2(__half* vals, int heads, int num_seq, int sequence_length, - float scale, + int head_offset, + int mask_stride, + int mp_size, cudaStream_t stream); diff --git a/csrc/transformer/inference/csrc/softmax.hip b/csrc/transformer/inference/csrc/softmax.hip index 51d5bef3a72436a23f910b3a73ada214b012389a..770120cc2373556931710f48d18717582a6dbea6 100644 --- a/csrc/transformer/inference/csrc/softmax.hip +++ b/csrc/transformer/inference/csrc/softmax.hip @@ -1,7 +1,11 @@ // !!! This is a file automatically generated by hipify!!! 
#include "hip/hip_runtime.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + #include -#include "custom_hip_layers.h" +#include "inference_cuda_layers.h" #ifndef __HIP_PLATFORM_HCC__ #include @@ -10,7 +14,7 @@ #include #include -#define ATTN_THREADS 1024 +#define ATTN_THREADS 256 #define MAX_REG_SIZE 8 #define minus_infinity -10000.0 @@ -30,6 +34,8 @@ namespace cg = cooperative_groups; __global__ void attn_softmax_v2(__half* vals, __half* mask, + __half* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -38,19 +44,18 @@ __global__ void attn_softmax_v2(__half* vals, int heads, int sequence_length, int num_seq, - float scale, + int head_offset, + int mask_stride, + int mp_size, int iterations, int reduceWidth) { -#ifdef HALF_PRECISION_AVAILABLE - cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); float2 low_data[MAX_REG_SIZE]; float2 high_data[MAX_REG_SIZE]; - - __half2 h_scale = __float2half2_rn(scale); + const __half zero_h = __float2half(0.f); int wid = threadIdx.x >> 5; int lane = threadIdx.x & 0x1f; @@ -62,11 +67,15 @@ __global__ void attn_softmax_v2(__half* vals, __shared__ float partialSum[MAX_WARP_NUM]; int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); + int batch_idx = iter_offset / (num_seq * heads); + int alibi_offset = batch_idx * heads * mp_size + head_offset; + int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride); if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); + alibi_offset = (alibi_offset + ((iter_offset / num_seq) % heads)) * sequence_length; + mask_offset = mask_offset * sequence_length; int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; @@ -78,47 +87,67 @@ __global__ void attn_softmax_v2(__half* vals, (local_attention && real_seq_id >= window_size) ? 
real_seq_id - window_size : -1; float max_val = minus_infinity; - + // if (lane == 0) printf("%d, %d: %d \n", wid, blockIdx.x, mask_offset); for (int i = 0; i < iterations; i++) { int data_id = i * (reduceWidth << 2) + (seq_lane << 2); if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && data_id < sequence_length) { if ((sequence_length - data_id) >= 4) { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; + low_data[i].x = data_id > window_stride + ? __half2float(vals[data_id]) * layer_scale + : minus_infinity; low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && (data_id + 1) > window_stride) - ? __half2float(vals[data_id + 1]) + ? __half2float(vals[data_id + 1]) * layer_scale : minus_infinity; high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) && (data_id + 2) > window_stride) - ? __half2float(vals[data_id + 2]) + ? __half2float(vals[data_id + 2]) * layer_scale : minus_infinity; high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) && (data_id + 3) > window_stride) - ? __half2float(vals[data_id + 3]) + ? __half2float(vals[data_id + 3]) * layer_scale : minus_infinity; - if (mask && recompute) { + if (alibi) { + low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]); + low_data[i].y = + low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]); + high_data[i].x = + high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]); + high_data[i].y = + high_data[i].y + __half2float(alibi[data_id + alibi_offset + 3]); + } + if (mask) { low_data[i].x += __half2float(mask[data_id + mask_offset]); low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); } } else { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; + low_data[i].x = data_id > window_stride + ? 
__half2float(vals[data_id]) * layer_scale + : minus_infinity; low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) && (data_id + 1) > window_stride) && (data_id + 1) < sequence_length) - ? __half2float(vals[data_id + 1]) + ? __half2float(vals[data_id + 1]) * layer_scale : minus_infinity; high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) && (data_id + 2) > window_stride) && (data_id + 2) < sequence_length) - ? __half2float(vals[data_id + 2]) + ? __half2float(vals[data_id + 2]) * layer_scale : minus_infinity; + if (alibi) { + low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]); + if ((data_id + 1) < sequence_length) + low_data[i].y = + low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]); + if ((data_id + 2) < sequence_length) + high_data[i].x = + high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]); + } high_data[i].y = minus_infinity; - if (mask && recompute) { + if (mask) { low_data[i].x += __half2float(mask[data_id + mask_offset]); if ((data_id + 1) < sequence_length) low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); @@ -189,23 +218,26 @@ __global__ void attn_softmax_v2(__half* vals, if (data_id < sequence_length) { if ((sequence_length - data_id) >= 4) { - vals[data_id] = low_data[i].x / sum; - vals[data_id + 1] = low_data[i].y / sum; - vals[data_id + 2] = high_data[i].x / sum; - vals[data_id + 3] = high_data[i].y / sum; + vals[data_id] = __float2half(low_data[i].x / sum); + vals[data_id + 1] = __float2half(low_data[i].y / sum); + vals[data_id + 2] = __float2half(high_data[i].x / sum); + vals[data_id + 3] = __float2half(high_data[i].y / sum); } else { - vals[data_id] = low_data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = low_data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = high_data[i].x / sum; + vals[data_id] = __float2half(low_data[i].x / sum); + if ((data_id + 1) < sequence_length) + vals[data_id + 1] = __float2half(low_data[i].y 
/ sum); + if ((data_id + 2) < sequence_length) + vals[data_id + 2] = __float2half(high_data[i].x / sum); } } } } -#endif } __global__ void attn_softmax_v2(float* vals, float* attn_mask, + float* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -214,7 +246,9 @@ __global__ void attn_softmax_v2(float* vals, int heads, int sequence_length, int num_seq, - float scale, + int head_offset, + int mask_stride, + int mp_size, int iterations, int reduceWidth) { @@ -236,7 +270,10 @@ __global__ void attn_softmax_v2(float* vals, if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); + int batch_idx = iter_offset / (num_seq * heads); + int alibi_offset = batch_idx * heads * mp_size + head_offset; + int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride); + mask_offset = mask_offset * sequence_length; int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; @@ -267,7 +304,7 @@ __global__ void attn_softmax_v2(float* vals, (data_id + 3) > window_stride) ? vals[data_id + 3] : minus_infinity; - if (attn_mask && recompute) { + if (attn_mask) { data[i].x += attn_mask[data_id + mask_offset]; data[i].y += attn_mask[data_id + mask_offset + 1]; data[i].z += attn_mask[data_id + mask_offset + 2]; @@ -284,7 +321,7 @@ __global__ void attn_softmax_v2(float* vals, ? 
(vals[data_id + 2]) : minus_infinity; data[i].w = minus_infinity; - if (attn_mask && recompute) { + if (attn_mask) { data[i].x += attn_mask[data_id + mask_offset]; if ((data_id + 1) < sequence_length) data[i].y += attn_mask[data_id + mask_offset + 1]; @@ -373,6 +410,8 @@ __global__ void attn_softmax_v2(float* vals, template void launch_attn_softmax_v2(T* vals, T* mask, + T* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -381,37 +420,46 @@ void launch_attn_softmax_v2(T* vals, int heads, int num_seq, int sequence_length, - float scale, + int head_offset, + int mask_stride, + int mp_size, hipStream_t stream) { int total_count = batch_size * heads * num_seq; - dim3 grid_dim((total_count - 1) / (WARP_SIZE / ((sequence_length - 1) / ATTN_THREADS + 1)) + 1); + int warp_num = ATTN_THREADS / WARP_SIZE; + int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1); + reduce_width = (int)pow(2.0, floor(log2((float)(reduce_width)))) * WARP_SIZE; + dim3 grid_dim((total_count - 1) / (ATTN_THREADS / reduce_width) + 1); dim3 block_dim(ATTN_THREADS); - const int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1) * WARP_SIZE; const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1; if (sequence_length <= 32768) - hipLaunchKernelGGL(( attn_softmax_v2), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, - mask, - triangular, - recompute, - local_attention, - window_size, - total_count, - (triangular ? 
(heads * batch_size) : heads), - sequence_length, - num_seq, - scale, - iterations, - reduce_width); + hipLaunchKernelGGL(( attn_softmax_v2), dim3(grid_dim), dim3(block_dim), 0, stream, vals, + mask, + alibi, + layer_scale, + triangular, + recompute, + local_attention, + window_size, + total_count, + heads, + sequence_length, + num_seq, + head_offset, + mask_stride, + mp_size, + iterations, + reduce_width); else throw std::runtime_error("Unsupport Seq_Length!"); } template void launch_attn_softmax_v2(float* vals, float* mask, + float* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -420,10 +468,14 @@ template void launch_attn_softmax_v2(float* vals, int heads, int num_seq, int sequence_length, - float scale, + int head_offset, + int mask_stride, + int mp_size, hipStream_t stream); template void launch_attn_softmax_v2(__half* vals, __half* mask, + __half* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -432,5 +484,7 @@ template void launch_attn_softmax_v2(__half* vals, int heads, int num_seq, int sequence_length, - float scale, + int head_offset, + int mask_stride, + int mp_size, hipStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/transform_kernels.cu b/csrc/transformer/inference/csrc/transform.cu similarity index 55% rename from deepspeed/ops/csrc/transformer_bak/transform_kernels.cu rename to csrc/transformer/inference/csrc/transform.cu index 15a2219333e43a6da1b93038a406b35d302bb9d9..023e02fe1c5271fba7100d66bb1a802b8759b3b3 100644 --- a/deepspeed/ops/csrc/transformer_bak/transform_kernels.cu +++ b/csrc/transformer/inference/csrc/transform.cu @@ -1,159 +1,345 @@ -#include "custom_cuda_layers.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ -#define rows_trans 16 -#define cols_trans 16 - -template -__global__ void Transpose_Kernel(const T* inp, T* out, int row_width, int col_width) -{ - __shared__ T data_block[rows_trans * (cols_trans + 1)]; +#ifndef 
__HIP_PLATFORM_HCC__ +#include +#endif +#include "inference_cuda_layers.h" +namespace cg = cooperative_groups; - int r = threadIdx.x / cols_trans; - int c = threadIdx.x % cols_trans; +// Bias add - int m = row_width / cols_trans; +__global__ void bias_add_transform_0213(float* output, + float* k_cache, + float* v_cache, + const float* vals, + const float* bias, + int hidden_dim, + int seq_length, + unsigned seq_offset, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + int head_ext, + int max_out_tokens) +{ + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; - int i = blockIdx.x / m * rows_trans + r; - int j = blockIdx.x % m * cols_trans + c; + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y; // Sequence ID (0-127) + int cnt = blockIdx.z / head_ext; // Hidden count + int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) - int row_stride = rows_trans / ((rows_trans * cols_trans + THREADS - 1) / THREADS); + int d2_out_stride = d2_stride * (cnt == 0 ? seq_length : max_out_tokens); + int d0_out_stride = hidden_dim * (cnt == 0 ? seq_length : max_out_tokens); - for (int k = 0; k < rows_trans; k += row_stride) - data_block[(k + r) * cols_trans + c] = inp[(i + k) * row_width + j]; + const float4* vals_vec = reinterpret_cast(vals); + float4* output_vec = + reinterpret_cast(cnt == 0 ? output : (cnt == 1 ? 
k_cache : v_cache)); - __syncthreads(); + vals_vec += (d0 * d0_stride * (gridDim.z / head_ext)); + vals_vec += (d1 * d1_stride * (gridDim.z / head_ext)); + vals_vec += (cnt * d1_stride); + vals_vec += (d2 * d2_stride); - i = blockIdx.x % m * rows_trans + r; - j = blockIdx.x / m * cols_trans + c; + output_vec += (d1 * d2_stride); + output_vec += (d0 * d0_out_stride); + output_vec += (d2 * d2_out_stride); - for (int k = 0; k < rows_trans; k += row_stride) - out[(i + k) * col_width + j] = data_block[c * cols_trans + r + k]; + unsigned seq_id = d1 + seq_offset; + float4 inputs = vals_vec[d3]; + int lane = d3 & 0x1f; + if (cnt < 2 && rotary_dim > 0 && d3 < rotary_dim) { + float4 q = vals_vec[d3]; + float2* q_f = reinterpret_cast(&q); + if (rotate_every_two) { +#pragma unroll + for (int o = 0; o < 2; o++) { + float inv_freq = (float)(((d3 << 1) + o) * 2) / (float)(rotary_dim << 2); + inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; + q_f[o].x = (-1.0 * q_f[o].y * sinf(inv_freq) + q_f[o].x * cosf(inv_freq)); + q_f[o].y = (q_f[o].x * sinf(inv_freq) + q_f[o].y * cosf(inv_freq)); + } + } + output_vec[d3] = q; + } else + output_vec[d3] = inputs; } -template <> -void Transpose<__half>(const __half* inp_mat, - __half* out_mat, - int rows, - int cols, - cudaStream_t stream) +#define ATTN_H 3 +#define MAX_SEQ_LINE 10 + +__global__ void bias_add_transform_0213(__half* output, // q + __half* k_cache, + __half* v_cache, + const __half* vals, // qkv + const __half* bias, + int hidden_dim, + int seq_length, + unsigned seq_offset, + int all_tokens, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + int head_ext, + int max_out_tokens) { - int threads = THREADS; + unsigned half_dim = (rotary_dim << 3) >> 1; + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; + + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y; // Sequence ID (0-127) + int cnt = blockIdx.z / head_ext; // Hidden count + int 
d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) + + int d2_out_stride = d2_stride * (cnt == 0 ? seq_length : max_out_tokens); + int d0_out_stride = hidden_dim * (cnt == 0 ? seq_length : max_out_tokens); + + float4 vals_arr; + float4 output_arr; + + __half2* vals_half = reinterpret_cast<__half2*>(&vals_arr); + __half2* output_half = reinterpret_cast<__half2*>(&output_arr); - Transpose_Kernel<__half><<<(rows * cols + threads - 1) / threads, threads, 0, stream>>>( - inp_mat, out_mat, cols, rows); + const float4* vals_vec = reinterpret_cast(vals); + float4* output_vec = + reinterpret_cast(cnt == 0 ? output : (cnt == 1 ? k_cache : v_cache)); + + vals_vec += (d0 * d0_stride * (gridDim.z / head_ext)); + vals_vec += (d1 * d1_stride * (gridDim.z / head_ext)); + vals_vec += (cnt * d1_stride); + vals_vec += (d2 * d2_stride); + + output_vec += (d1 * d2_stride); + output_vec += (d0 * d0_out_stride); + output_vec += (d2 * d2_out_stride); + + unsigned seq_id = d1 + seq_offset; + + int lane = d3 & 0x1f; + if (cnt < 2 && rotary_dim > 0 && d3 < rotary_dim) { + float4 q = vals_vec[d3]; + __half2* q_h = reinterpret_cast<__half2*>(&q); + if (rotate_every_two) { +#pragma unroll + for (int o = 0; o < 4; o++) { + float inv_freq = (float)(((d3 << 2) + o) * 2) / (float)(rotary_dim << 3); + inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; + float q_data[2]; + q_data[0] = (float)q_h[o].x; + q_data[1] = (float)q_h[o].y; + q_h[o].x = (__half)(-1.0 * q_data[1] * sinf(inv_freq) + q_data[0] * cosf(inv_freq)); + q_h[o].y = (__half)(q_data[0] * sinf(inv_freq) + q_data[1] * cosf(inv_freq)); + } + } + output_vec[d3] = q; + } else + output_vec[d3] = vals_vec[d3]; } +// [B S C*H] - > C * [B A S N] template <> -void Transpose(const float* inp_mat, float* out_mat, int rows, int cols, cudaStream_t stream) +void launch_bias_add_transform_0213(float* output, + float* k_cache, + float* v_cache, + const float* vals, + 
const float* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int all_tokens, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + cudaStream_t stream, + int trans_count, + int max_out_tokens) { - int threads = THREADS; + hidden_dim >>= 2; + int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - Transpose_Kernel<<<(rows * cols + threads - 1) / threads, threads, 0, stream>>>( - inp_mat, out_mat, cols, rows); -} + dim3 block_dim(hidden_dim / heads, (heads / head_ext)); + dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); + bias_add_transform_0213<<>>(output, + k_cache, + v_cache, + vals, + bias, + hidden_dim, + seq_length, + seq_offset, + heads, + rotary_dim >> 2, + rotate_half, + rotate_every_two, + head_ext, + max_out_tokens); +} template -__global__ void transform_0213(T* output, - const T* vals, - int hidden_dim, - int seq_length, - int heads, - int head_ext); - +void launch_bias_add_transform_0213(T* outputs, + T* vals, + T* vals1, + const T* vals2, + const T* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int seq_length1, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + cudaStream_t stream, + int trans_count, + int max_out_tokens); template <> -__global__ void transform_0213(float* output, - const float* vals, - int hidden_dim, - int seq_length, - int heads, - int head_ext) +void launch_bias_add_transform_0213<__half>(__half* output, + __half* k_cache, + __half* v_cache, + const __half* vals, + const __half* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int all_tokens, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + cudaStream_t stream, + int trans_count, + int max_out_tokens) { - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - 
int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) - int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) + hidden_dim >>= 3; + int head_ext = 1; // (hidden_dim - 1) / MAX_THREADS + 1; + dim3 block_dim(hidden_dim / heads, (heads / head_ext)); + dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); + bias_add_transform_0213<<>>(output, + k_cache, + v_cache, + vals, + bias, + hidden_dim, + seq_length, + seq_offset, + all_tokens, + heads, + rotary_dim >> 3, + rotate_half, + rotate_every_two, + head_ext, + max_out_tokens); +} - const float4* vals_vec = reinterpret_cast(vals); - float4* output_vec = reinterpret_cast(output); +// Bias add - float4 inputs = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3]; - output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = inputs; +__global__ void pad_add_transform_0213(float* output, + const float* vals, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size) +{ } -template <> -__global__ void transform_0213<__half>(__half* output, +__global__ void pad_add_transform_0213(__half* output, const __half* vals, int hidden_dim, int seq_length, + int padded_seq_len, int heads, - int head_ext) + int padded_head_size) { -#ifdef HALF_PRECISION_AVAILABLE + float4 ZERO; + const __half2 zero_h = __float2half2_rn(0.f); + __half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO); +#pragma unroll + for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h; int d0_stride = hidden_dim * seq_length; int d1_stride = hidden_dim; int d2_stride = hidden_dim / heads; - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) - int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads 
/ head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y * blockDim.z + threadIdx.z; // Sequence ID (0-127) + int d2 = threadIdx.y; // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) - float4 vals_arr[1]; + int d2_out_stride = padded_head_size * padded_seq_len; + int d0_out_stride = heads * d2_out_stride; const float4* vals_vec = reinterpret_cast(vals); float4* output_vec = reinterpret_cast(output); - vals_arr[0] = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3]; - output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = vals_arr[0]; -#endif -} + vals_vec += (d0 * d0_stride); + vals_vec += (d1 * d1_stride); + vals_vec += (d2 * d2_stride); -template <> -void launch_transform_0213(float* output, - const float* vals, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - cudaStream_t stream) -{ - hidden_dim >>= 2; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, (seq_length * head_ext)); + output_vec += (d1 * padded_head_size); + output_vec += (d0 * d0_out_stride); + output_vec += (d2 * d2_out_stride); - transform_0213 - <<>>(output, vals, hidden_dim, seq_length, heads, head_ext); + if (d3 < d2_stride && d1 < seq_length) + output_vec[d3] = vals_vec[d3]; + else + output_vec[d3] = ZERO; } -template <> -void launch_transform_0213<__half>(__half* output, - const __half* vals, +template +void launch_pad_add_transform_0213(T* output, + const T* vals, int batch_size, - int seq_length, int hidden_dim, + int seq_length, + int padded_seq_len, int heads, - cudaStream_t stream) + int padded_head_size, + cudaStream_t stream); + +// [B S C*H] - > C * [B A S N] +template <> +void launch_pad_add_transform_0213(float* output, + const float* vals, + int batch_size, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size, + 
cudaStream_t stream) +{ +} +template <> +void launch_pad_add_transform_0213<__half>(__half* output, + const __half* vals, + int batch_size, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size, + cudaStream_t stream) { hidden_dim >>= 3; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, (seq_length * head_ext)); - transform_0213<__half> - <<>>(output, vals, hidden_dim, seq_length, heads, head_ext); + dim3 block_dim((padded_head_size >> 3), heads, 2); + dim3 grid_dim(batch_size, padded_seq_len / 2); + pad_add_transform_0213<<>>( + output, vals, hidden_dim, seq_length, padded_seq_len, heads, padded_head_size >> 3); } // Bias add @@ -207,9 +393,6 @@ __global__ void bias_add_transform_0213(float* output, d2 * d2_out_stride + d3] = outputs; } -#define ATTN_H 3 -#define MAX_SEQ_LINE 10 - template <> __global__ void bias_add_transform_0213<__half>(__half* output, const __half* vals, @@ -219,8 +402,6 @@ __global__ void bias_add_transform_0213<__half>(__half* output, int heads, int head_ext) { -#ifdef HALF_PRECISION_AVAILABLE - int d0_stride = hidden_dim * seq_length; int d1_stride = hidden_dim; int d2_stride = hidden_dim / heads; @@ -260,26 +441,11 @@ __global__ void bias_add_transform_0213<__half>(__half* output, bias_arr = bias_vec[d3]; vals_arr = vals_vec[d3]; -#if defined(__ACC_HALF__) output_half[0] = vals_half[0] + bias_half[0]; output_half[1] = vals_half[1] + bias_half[1]; output_half[2] = vals_half[2] + bias_half[2]; output_half[3] = vals_half[3] + bias_half[3]; -#else - float2 bias_arr_f[4]; - float2 vals_arr_f[4]; -#pragma unroll - for (int l = 0; l < 4; l++) { - bias_arr_f[l] = __half22float2(bias_half[l]); - vals_arr_f[l] = __half22float2(vals_half[l]); - vals_arr_f[l].x += bias_arr_f[l].x; - vals_arr_f[l].y += bias_arr_f[l].y; - output_half[l] = __float22half2_rn(vals_arr_f[l]); - } -#endif output_vec[d3] = output_arr; - -#endif } 
__global__ void bias_add_transform_0213_v2(__half* output, @@ -289,7 +455,6 @@ __global__ void bias_add_transform_0213_v2(__half* output, int seq_length, int heads) { -#ifdef HALF_PRECISION_AVAILABLE __shared__ float4 in_data[3072]; int d0_stride = hidden_dim * seq_length; @@ -351,55 +516,6 @@ __global__ void bias_add_transform_0213_v2(__half* output, output_vec[out_index + iter_offset] = in_data[iter_row * d2_stride + d3 + (d2 % 2) * (d1_stride * blockDim.z)]; } -#endif -} - -// [B S C*H] - > C * [B A S N] -template <> -void launch_bias_add_transform_0213(float* output, - const float* vals, - const float* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - cudaStream_t stream, - int trans_count) -{ - hidden_dim >>= 2; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); - - bias_add_transform_0213<<>>( - output, vals, bias, hidden_dim, seq_length, heads, head_ext); -} - -template <> -void launch_bias_add_transform_0213<__half>(__half* output, - const __half* vals, - const __half* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - cudaStream_t stream, - int trans_count) -{ - hidden_dim >>= 3; - if (hidden_dim > 128 || hidden_dim < 16) { - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); - bias_add_transform_0213<__half><<>>( - output, vals, bias, hidden_dim, seq_length, heads, head_ext); - } else { - dim3 block_dim(hidden_dim / heads, heads, trans_count); - dim3 grid_dim(batch_size, seq_length / 2); - bias_add_transform_0213_v2<<>>( - output, vals, bias, hidden_dim, seq_length, heads); - } } template @@ -451,8 +567,6 @@ __global__ void transform4d_0213<__half>(__half* out, int hidden_dim, int head_ext) { -#ifdef HALF_PRECISION_AVAILABLE - int d0_stride = hidden_dim * 
(seq_length / head_ext); int d1_stride = hidden_dim; int d2_stride = hidden_dim / heads; @@ -477,8 +591,6 @@ __global__ void transform4d_0213<__half>(__half* out, out_vec += (d2 * d1_stride * gridDim.y); out_vec[d3] = in_vec[d3]; - -#endif } __global__ void transform4d_0213_v2(__half* out, @@ -487,7 +599,6 @@ __global__ void transform4d_0213_v2(__half* out, int seq_length, int hidden_dim) { -#ifdef HALF_PRECISION_AVAILABLE __shared__ float4 in_data[3072]; int d0_stride = hidden_dim * seq_length; @@ -528,7 +639,6 @@ __global__ void transform4d_0213_v2(__half* out, int iter_id = iter * iteration_stride + iter_index; out_vec[output_offset + iter_id] = in_data[iter_id]; } -#endif } // 3 * [B A S N] - > [B S C*H] @@ -560,16 +670,9 @@ void launch_transform4d_0213<__half>(__half* out, int trans_count) { hidden_dim >>= 3; - if (hidden_dim > 128 || hidden_dim < 16) { - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext)); - dim3 block_dims(hidden_dim / heads, (heads / head_ext)); - transform4d_0213<__half><<>>( - out, in, heads, seq_length, hidden_dim, head_ext); - } else { - dim3 grid_dims(batch_size, seq_length / 2); - dim3 block_dims(hidden_dim / heads, heads, trans_count); - transform4d_0213_v2<<>>( - out, in, heads, seq_length, hidden_dim); - } + int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; + dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext)); + dim3 block_dims(hidden_dim / heads, (heads / head_ext)); + transform4d_0213<__half> + <<>>(out, in, heads, seq_length, hidden_dim, head_ext); } diff --git a/deepspeed/ops/csrc/transformer_bak/transform_kernels.hip b/csrc/transformer/inference/csrc/transform.hip similarity index 55% rename from deepspeed/ops/csrc/transformer_bak/transform_kernels.hip rename to csrc/transformer/inference/csrc/transform.hip index 0aaa4cca150e18ed63c701e66ce4eaf6313e30ab..c9ff334aa052a9663dbf183974c0faca48e4db29 100644 --- 
a/deepspeed/ops/csrc/transformer_bak/transform_kernels.hip +++ b/csrc/transformer/inference/csrc/transform.hip @@ -1,161 +1,347 @@ // !!! This is a file automatically generated by hipify!!! #include "hip/hip_runtime.h" -#include "custom_hip_layers.h" +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ -#define rows_trans 16 -#define cols_trans 16 - -template -__global__ void Transpose_Kernel(const T* inp, T* out, int row_width, int col_width) -{ - __shared__ T data_block[rows_trans * (cols_trans + 1)]; +#ifndef __HIP_PLATFORM_HCC__ +#include +#endif +#include "inference_cuda_layers.h" +namespace cg = cooperative_groups; - int r = threadIdx.x / cols_trans; - int c = threadIdx.x % cols_trans; +// Bias add - int m = row_width / cols_trans; +__global__ void bias_add_transform_0213(float* output, + float* k_cache, + float* v_cache, + const float* vals, + const float* bias, + int hidden_dim, + int seq_length, + unsigned seq_offset, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + int head_ext, + int max_out_tokens) +{ + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; - int i = blockIdx.x / m * rows_trans + r; - int j = blockIdx.x % m * cols_trans + c; + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y; // Sequence ID (0-127) + int cnt = blockIdx.z / head_ext; // Hidden count + int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) - int row_stride = rows_trans / ((rows_trans * cols_trans + THREADS - 1) / THREADS); + int d2_out_stride = d2_stride * (cnt == 0 ? seq_length : max_out_tokens); + int d0_out_stride = hidden_dim * (cnt == 0 ? seq_length : max_out_tokens); - for (int k = 0; k < rows_trans; k += row_stride) - data_block[(k + r) * cols_trans + c] = inp[(i + k) * row_width + j]; + const float4* vals_vec = reinterpret_cast(vals); + float4* output_vec = + reinterpret_cast(cnt == 0 ? 
output : (cnt == 1 ? k_cache : v_cache)); - __syncthreads(); + vals_vec += (d0 * d0_stride * (gridDim.z / head_ext)); + vals_vec += (d1 * d1_stride * (gridDim.z / head_ext)); + vals_vec += (cnt * d1_stride); + vals_vec += (d2 * d2_stride); - i = blockIdx.x % m * rows_trans + r; - j = blockIdx.x / m * cols_trans + c; + output_vec += (d1 * d2_stride); + output_vec += (d0 * d0_out_stride); + output_vec += (d2 * d2_out_stride); - for (int k = 0; k < rows_trans; k += row_stride) - out[(i + k) * col_width + j] = data_block[c * cols_trans + r + k]; + unsigned seq_id = d1 + seq_offset; + float4 inputs = vals_vec[d3]; + int lane = d3 & 0x1f; + if (cnt < 2 && rotary_dim > 0 && d3 < rotary_dim) { + float4 q = vals_vec[d3]; + float2* q_f = reinterpret_cast(&q); + if (rotate_every_two) { +#pragma unroll + for (int o = 0; o < 2; o++) { + float inv_freq = (float)(((d3 << 1) + o) * 2) / (float)(rotary_dim << 2); + inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; + q_f[o].x = (-1.0 * q_f[o].y * sinf(inv_freq) + q_f[o].x * cosf(inv_freq)); + q_f[o].y = (q_f[o].x * sinf(inv_freq) + q_f[o].y * cosf(inv_freq)); + } + } + output_vec[d3] = q; + } else + output_vec[d3] = inputs; } -template <> -void Transpose<__half>(const __half* inp_mat, - __half* out_mat, - int rows, - int cols, - hipStream_t stream) +#define ATTN_H 3 +#define MAX_SEQ_LINE 10 + +__global__ void bias_add_transform_0213(__half* output, // q + __half* k_cache, + __half* v_cache, + const __half* vals, // qkv + const __half* bias, + int hidden_dim, + int seq_length, + unsigned seq_offset, + int all_tokens, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + int head_ext, + int max_out_tokens) { - int threads = THREADS; + unsigned half_dim = (rotary_dim << 3) >> 1; + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; + + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y; // Sequence ID (0-127) + int cnt = blockIdx.z / head_ext; // 
Hidden count + int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) + + int d2_out_stride = d2_stride * (cnt == 0 ? seq_length : max_out_tokens); + int d0_out_stride = hidden_dim * (cnt == 0 ? seq_length : max_out_tokens); + + float4 vals_arr; + float4 output_arr; + + __half2* vals_half = reinterpret_cast<__half2*>(&vals_arr); + __half2* output_half = reinterpret_cast<__half2*>(&output_arr); - hipLaunchKernelGGL(( Transpose_Kernel<__half>), dim3((rows * cols + threads - 1) / threads), dim3(threads), 0, stream, - inp_mat, out_mat, cols, rows); + const float4* vals_vec = reinterpret_cast(vals); + float4* output_vec = + reinterpret_cast(cnt == 0 ? output : (cnt == 1 ? k_cache : v_cache)); + + vals_vec += (d0 * d0_stride * (gridDim.z / head_ext)); + vals_vec += (d1 * d1_stride * (gridDim.z / head_ext)); + vals_vec += (cnt * d1_stride); + vals_vec += (d2 * d2_stride); + + output_vec += (d1 * d2_stride); + output_vec += (d0 * d0_out_stride); + output_vec += (d2 * d2_out_stride); + + unsigned seq_id = d1 + seq_offset; + + int lane = d3 & 0x1f; + if (cnt < 2 && rotary_dim > 0 && d3 < rotary_dim) { + float4 q = vals_vec[d3]; + __half2* q_h = reinterpret_cast<__half2*>(&q); + if (rotate_every_two) { +#pragma unroll + for (int o = 0; o < 4; o++) { + float inv_freq = (float)(((d3 << 2) + o) * 2) / (float)(rotary_dim << 3); + inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; + float q_data[2]; + q_data[0] = (float)q_h[o].x; + q_data[1] = (float)q_h[o].y; + q_h[o].x = (__half)(-1.0 * q_data[1] * sinf(inv_freq) + q_data[0] * cosf(inv_freq)); + q_h[o].y = (__half)(q_data[0] * sinf(inv_freq) + q_data[1] * cosf(inv_freq)); + } + } + output_vec[d3] = q; + } else + output_vec[d3] = vals_vec[d3]; } +// [B S C*H] - > C * [B A S N] template <> -void Transpose(const float* inp_mat, float* out_mat, int rows, int cols, hipStream_t stream) +void launch_bias_add_transform_0213(float* output, + float* 
k_cache, + float* v_cache, + const float* vals, + const float* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int all_tokens, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + hipStream_t stream, + int trans_count, + int max_out_tokens) { - int threads = THREADS; + hidden_dim >>= 2; + int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - hipLaunchKernelGGL(( Transpose_Kernel), dim3((rows * cols + threads - 1) / threads), dim3(threads), 0, stream, - inp_mat, out_mat, cols, rows); -} + dim3 block_dim(hidden_dim / heads, (heads / head_ext)); + dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); + hipLaunchKernelGGL(( bias_add_transform_0213), dim3(grid_dim), dim3(block_dim), 0, stream, output, + k_cache, + v_cache, + vals, + bias, + hidden_dim, + seq_length, + seq_offset, + heads, + rotary_dim >> 2, + rotate_half, + rotate_every_two, + head_ext, + max_out_tokens); +} template -__global__ void transform_0213(T* output, - const T* vals, - int hidden_dim, - int seq_length, - int heads, - int head_ext); - +void launch_bias_add_transform_0213(T* outputs, + T* vals, + T* vals1, + const T* vals2, + const T* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int seq_length1, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + hipStream_t stream, + int trans_count, + int max_out_tokens); template <> -__global__ void transform_0213(float* output, - const float* vals, - int hidden_dim, - int seq_length, - int heads, - int head_ext) +void launch_bias_add_transform_0213<__half>(__half* output, + __half* k_cache, + __half* v_cache, + const __half* vals, + const __half* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int all_tokens, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + hipStream_t stream, + int trans_count, + int max_out_tokens) { - int d0_stride = hidden_dim * 
seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) - int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) + hidden_dim >>= 3; + int head_ext = 1; // (hidden_dim - 1) / MAX_THREADS + 1; + dim3 block_dim(hidden_dim / heads, (heads / head_ext)); + dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); + hipLaunchKernelGGL(( bias_add_transform_0213), dim3(grid_dim), dim3(block_dim), 0, stream, output, + k_cache, + v_cache, + vals, + bias, + hidden_dim, + seq_length, + seq_offset, + all_tokens, + heads, + rotary_dim >> 3, + rotate_half, + rotate_every_two, + head_ext, + max_out_tokens); +} - const float4* vals_vec = reinterpret_cast(vals); - float4* output_vec = reinterpret_cast(output); +// Bias add - float4 inputs = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3]; - output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = inputs; +__global__ void pad_add_transform_0213(float* output, + const float* vals, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size) +{ } -template <> -__global__ void transform_0213<__half>(__half* output, +__global__ void pad_add_transform_0213(__half* output, const __half* vals, int hidden_dim, int seq_length, + int padded_seq_len, int heads, - int head_ext) + int padded_head_size) { -#ifdef HALF_PRECISION_AVAILABLE + float4 ZERO; + const __half2 zero_h = __float2half2_rn(0.f); + __half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO); +#pragma unroll + for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h; int d0_stride = hidden_dim * seq_length; int d1_stride = hidden_dim; int d2_stride = hidden_dim / heads; - int d0_out_stride = d0_stride; - int 
d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) - int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y * blockDim.z + threadIdx.z; // Sequence ID (0-127) + int d2 = threadIdx.y; // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) - float4 vals_arr[1]; + int d2_out_stride = padded_head_size * padded_seq_len; + int d0_out_stride = heads * d2_out_stride; const float4* vals_vec = reinterpret_cast(vals); float4* output_vec = reinterpret_cast(output); - vals_arr[0] = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3]; - output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = vals_arr[0]; -#endif -} + vals_vec += (d0 * d0_stride); + vals_vec += (d1 * d1_stride); + vals_vec += (d2 * d2_stride); -template <> -void launch_transform_0213(float* output, - const float* vals, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - hipStream_t stream) -{ - hidden_dim >>= 2; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, (seq_length * head_ext)); + output_vec += (d1 * padded_head_size); + output_vec += (d0 * d0_out_stride); + output_vec += (d2 * d2_out_stride); - hipLaunchKernelGGL(( transform_0213) - , dim3(grid_dim), dim3(block_dim), 0, stream, output, vals, hidden_dim, seq_length, heads, head_ext); + if (d3 < d2_stride && d1 < seq_length) + output_vec[d3] = vals_vec[d3]; + else + output_vec[d3] = ZERO; } -template <> -void launch_transform_0213<__half>(__half* output, - const __half* vals, +template +void launch_pad_add_transform_0213(T* output, + const T* vals, int batch_size, - int seq_length, int hidden_dim, + int seq_length, + int padded_seq_len, int heads, - hipStream_t stream) 
+ int padded_head_size, + hipStream_t stream); + +// [B S C*H] - > C * [B A S N] +template <> +void launch_pad_add_transform_0213(float* output, + const float* vals, + int batch_size, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size, + hipStream_t stream) +{ +} +template <> +void launch_pad_add_transform_0213<__half>(__half* output, + const __half* vals, + int batch_size, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size, + hipStream_t stream) { hidden_dim >>= 3; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, (seq_length * head_ext)); - hipLaunchKernelGGL(( transform_0213<__half>) - , dim3(grid_dim), dim3(block_dim), 0, stream, output, vals, hidden_dim, seq_length, heads, head_ext); + dim3 block_dim((padded_head_size >> 3), heads, 2); + dim3 grid_dim(batch_size, padded_seq_len / 2); + hipLaunchKernelGGL(( pad_add_transform_0213), dim3(grid_dim), dim3(block_dim), 0, stream, + output, vals, hidden_dim, seq_length, padded_seq_len, heads, padded_head_size >> 3); } // Bias add @@ -209,9 +395,6 @@ __global__ void bias_add_transform_0213(float* output, d2 * d2_out_stride + d3] = outputs; } -#define ATTN_H 3 -#define MAX_SEQ_LINE 10 - template <> __global__ void bias_add_transform_0213<__half>(__half* output, const __half* vals, @@ -221,8 +404,6 @@ __global__ void bias_add_transform_0213<__half>(__half* output, int heads, int head_ext) { -#ifdef HALF_PRECISION_AVAILABLE - int d0_stride = hidden_dim * seq_length; int d1_stride = hidden_dim; int d2_stride = hidden_dim / heads; @@ -262,26 +443,11 @@ __global__ void bias_add_transform_0213<__half>(__half* output, bias_arr = bias_vec[d3]; vals_arr = vals_vec[d3]; -#if defined(__ACC_HALF__) output_half[0] = vals_half[0] + bias_half[0]; output_half[1] = vals_half[1] + bias_half[1]; output_half[2] = vals_half[2] + bias_half[2]; output_half[3] = 
vals_half[3] + bias_half[3]; -#else - float2 bias_arr_f[4]; - float2 vals_arr_f[4]; -#pragma unroll - for (int l = 0; l < 4; l++) { - bias_arr_f[l] = __half22float2(bias_half[l]); - vals_arr_f[l] = __half22float2(vals_half[l]); - vals_arr_f[l].x += bias_arr_f[l].x; - vals_arr_f[l].y += bias_arr_f[l].y; - output_half[l] = __float22half2_rn(vals_arr_f[l]); - } -#endif output_vec[d3] = output_arr; - -#endif } __global__ void bias_add_transform_0213_v2(__half* output, @@ -291,7 +457,6 @@ __global__ void bias_add_transform_0213_v2(__half* output, int seq_length, int heads) { -#ifdef HALF_PRECISION_AVAILABLE __shared__ float4 in_data[3072]; int d0_stride = hidden_dim * seq_length; @@ -353,55 +518,6 @@ __global__ void bias_add_transform_0213_v2(__half* output, output_vec[out_index + iter_offset] = in_data[iter_row * d2_stride + d3 + (d2 % 2) * (d1_stride * blockDim.z)]; } -#endif -} - -// [B S C*H] - > C * [B A S N] -template <> -void launch_bias_add_transform_0213(float* output, - const float* vals, - const float* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - hipStream_t stream, - int trans_count) -{ - hidden_dim >>= 2; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); - - hipLaunchKernelGGL(( bias_add_transform_0213), dim3(grid_dim), dim3(block_dim), 0, stream, - output, vals, bias, hidden_dim, seq_length, heads, head_ext); -} - -template <> -void launch_bias_add_transform_0213<__half>(__half* output, - const __half* vals, - const __half* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - hipStream_t stream, - int trans_count) -{ - hidden_dim >>= 3; - if (hidden_dim > 128 || hidden_dim < 16) { - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); - hipLaunchKernelGGL(( 
bias_add_transform_0213<__half>), dim3(grid_dim), dim3(block_dim), 0, stream, - output, vals, bias, hidden_dim, seq_length, heads, head_ext); - } else { - dim3 block_dim(hidden_dim / heads, heads, trans_count); - dim3 grid_dim(batch_size, seq_length / 2); - hipLaunchKernelGGL(( bias_add_transform_0213_v2), dim3(grid_dim), dim3(block_dim), 0, stream, - output, vals, bias, hidden_dim, seq_length, heads); - } } template @@ -453,8 +569,6 @@ __global__ void transform4d_0213<__half>(__half* out, int hidden_dim, int head_ext) { -#ifdef HALF_PRECISION_AVAILABLE - int d0_stride = hidden_dim * (seq_length / head_ext); int d1_stride = hidden_dim; int d2_stride = hidden_dim / heads; @@ -479,8 +593,6 @@ __global__ void transform4d_0213<__half>(__half* out, out_vec += (d2 * d1_stride * gridDim.y); out_vec[d3] = in_vec[d3]; - -#endif } __global__ void transform4d_0213_v2(__half* out, @@ -489,7 +601,6 @@ __global__ void transform4d_0213_v2(__half* out, int seq_length, int hidden_dim) { -#ifdef HALF_PRECISION_AVAILABLE __shared__ float4 in_data[3072]; int d0_stride = hidden_dim * seq_length; @@ -530,7 +641,6 @@ __global__ void transform4d_0213_v2(__half* out, int iter_id = iter * iteration_stride + iter_index; out_vec[output_offset + iter_id] = in_data[iter_id]; } -#endif } // 3 * [B A S N] - > [B S C*H] @@ -562,16 +672,9 @@ void launch_transform4d_0213<__half>(__half* out, int trans_count) { hidden_dim >>= 3; - if (hidden_dim > 128 || hidden_dim < 16) { - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext)); - dim3 block_dims(hidden_dim / heads, (heads / head_ext)); - hipLaunchKernelGGL(( transform4d_0213<__half>), dim3(grid_dims), dim3(block_dims), 0, stream, - out, in, heads, seq_length, hidden_dim, head_ext); - } else { - dim3 grid_dims(batch_size, seq_length / 2); - dim3 block_dims(hidden_dim / heads, heads, trans_count); - hipLaunchKernelGGL(( transform4d_0213_v2), dim3(grid_dims), dim3(block_dims), 0, stream, - 
out, in, heads, seq_length, hidden_dim); - } + int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; + dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext)); + dim3 block_dims(hidden_dim / heads, (heads / head_ext)); + hipLaunchKernelGGL(( transform4d_0213<__half>) + , dim3(grid_dims), dim3(block_dims), 0, stream, out, in, heads, seq_length, hidden_dim, head_ext); } diff --git a/deepspeed/ops/csrc/transformer/inference/includes/context.h b/csrc/transformer/inference/includes/inference_context.h similarity index 53% rename from deepspeed/ops/csrc/transformer/inference/includes/context.h rename to csrc/transformer/inference/includes/inference_context.h index 21f0b3cfe07b3f5f519af7a1b3a4daa4f7b88424..b3851ca43b725dda388de7adc86048282904a176 100644 --- a/deepspeed/ops/csrc/transformer/inference/includes/context.h +++ b/csrc/transformer/inference/includes/inference_context.h @@ -1,14 +1,21 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + #pragma once -#include +#include #include #include #include #include #include "cublas_v2.h" #include "cuda.h" -#include "curand.h" +#define MEGABYTE (1024 * 1024) +#define GIGABYTE (1024 * 1024 * 1024) + +// TODO: refactor out #define WARP_SIZE 32 #define CUDA_CHECK(callstr) \ @@ -40,10 +47,15 @@ inline int DS_GET_BLOCKS(const int N) class Context { public: - Context() : _workspace(nullptr), _seed(42), _curr_offset(0), _stream(0) + Context() + : _workspace(nullptr), + _seed(42), + _curr_offset(0), + _stream(0), + _free_memory_size(0), + _num_tokens(1), + _attention_unfused_workspace_offset(0) { - curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(_gen, 123); if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) { auto message = std::string("Fail to create cublas handle."); std::cerr << message << std::endl; @@ -51,16 +63,11 @@ public: } #ifndef __HIP_PLATFORM_HCC__ cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); - cudaEventCreate(&_comp1_event, 
(cudaEventDisableTiming | cudaEventBlockingSync)); - cudaEventCreate(&_comp2_event, (cudaEventDisableTiming | cudaEventBlockingSync)); - cudaEventCreate(&_comp_event, (cudaEventDisableTiming | cudaEventBlockingSync)); - cudaEventCreate(&_comm_event, (cudaEventDisableTiming | cudaEventBlockingSync)); -#else +#endif cudaEventCreate(&_comp1_event); cudaEventCreate(&_comp2_event); cudaEventCreate(&_comp_event); cudaEventCreate(&_comm_event); -#endif } virtual ~Context() @@ -79,23 +86,88 @@ public: return _ctx; } - void GenWorkSpace(size_t size) + void GenWorkSpace(const unsigned& num_layers, + const unsigned& num_heads, + const size_t& batch_size, + const size_t& prompt_len, + const size_t& hidden_dim, + const unsigned& mp_size, + const bool& external_cache, + const size_t& elem_size, + const unsigned& rank, + unsigned max_out_tokens) { + size_t total_size; + if (!_free_memory_size) { cudaMemGetInfo(&_free_memory_size, &total_size); } + + // Flash attention requires padded heads and we'll conservatively allocate + // for that here. Flash attention is only enabled for head size <= 128 right now + const int head_size = hidden_dim / num_heads; + const int padded_head_size = head_size <= 32 ? 32 : (head_size <= 64 ? 64 : 128); + const int effective_head_size = (head_size > 128) ? head_size : padded_head_size; + + size_t activation_size = 16 * (num_heads * effective_head_size) * batch_size; + // Other sequence length dimension is added when the final workSpaceSize is calculated + size_t temp_size = batch_size * num_heads * max_out_tokens * 2; + size_t cache_size = + num_layers * batch_size * ((num_heads * effective_head_size) / mp_size) * 2; + size_t minimal_requirements = + temp_size + (_free_memory_size > GIGABYTE ? 
500 : 100) * MEGABYTE; + if (_free_memory_size < minimal_requirements) { + printf("Requested:\t%lu\nFree:\t%lu\nTotal:\t%lu\n", + minimal_requirements, + _free_memory_size, + total_size); + throw std::runtime_error("Workspace can't be allocated, no enough memory."); + } + + _max_seq_len = ((_free_memory_size - minimal_requirements) / elem_size) / + (activation_size + temp_size + cache_size); + _max_seq_len = std::min((size_t)max_out_tokens, _max_seq_len); + size_t workSpaceSize = ((external_cache ? (activation_size + temp_size) + : (activation_size + temp_size + cache_size))) * + _max_seq_len * elem_size; + temp_size *= _max_seq_len * elem_size; + if (rank == 0 && !_workspace) + printf( + "------------------------------------------------------\n" + "Free memory : %f (GigaBytes) \n" + "Total memory: %f (GigaBytes) \n" + "Requested memory: %f (GigaBytes) \n" + "Setting maximum total tokens (input + output) to %lu \n" + "------------------------------------------------------\n", + (float)_free_memory_size / GIGABYTE, + (float)total_size / GIGABYTE, + (float)workSpaceSize / GIGABYTE, + _max_seq_len); if (!_workspace) { assert(_workspace == nullptr); - cudaMalloc(&_workspace, size); - } else if (_workSpaceSize < size) { + cudaMalloc(&_workspace, workSpaceSize); + } else if (_workSpaceSize < workSpaceSize) { cudaFree(_workspace); - cudaMalloc(&_workspace, size); + cudaMalloc(&_workspace, workSpaceSize); } - _workSpaceSize = size; + if (!_workspace) { + printf("Requested:\t%lu\nFree:\t%lu\nTotal:\t%lu\n", + workSpaceSize, + _free_memory_size, + total_size); + throw std::runtime_error("Workspace is null."); + } + _workSpaceSize = workSpaceSize; + _attention_unfused_workspace_offset = workSpaceSize - temp_size; } + inline size_t GetMaxTokenLenght() const { return _max_seq_len; } cudaEvent_t GetCompEvent(int id) { return id == 1 ? 
_comp1_event : _comp2_event; } size_t get_workspace_size() const { return _workSpaceSize; } void* GetWorkSpace() { return _workspace; } + void* GetAttentionUnfusedWorkspace() + { + return (char*)_workspace + _attention_unfused_workspace_offset; + } inline unsigned new_token(unsigned layer_id) { @@ -103,7 +175,7 @@ public: return _token_length; } - inline void reset_tokens(unsigned initial_tokens = 0) + inline void reset_tokens(unsigned initial_tokens = 1) { _num_tokens = initial_tokens; } //_token_length = 0; } @@ -112,8 +184,6 @@ public: inline void advance_tokens() { _num_tokens++; } - curandGenerator_t& GetRandGenerator() { return _gen; } - cudaStream_t GetCommStream(bool async_op = false) { if (!_comm_stream) @@ -157,16 +227,21 @@ public: } private: - curandGenerator_t _gen; cublasHandle_t _cublasHandle; cudaEvent_t _comp_event; cudaEvent_t _comm_event; void* _workspace; + // offset from _workspace for attention unfused memory + size_t _attention_unfused_workspace_offset; uint64_t _seed; uint64_t _curr_offset; + size_t _workSpaceSize; + size_t _free_memory_size; + + size_t _max_seq_len; cudaEvent_t _comp1_event; cudaEvent_t _comp2_event; diff --git a/deepspeed/ops/csrc/transformer/inference/includes/context_hip.h b/csrc/transformer/inference/includes/inference_context_hip.h similarity index 54% rename from deepspeed/ops/csrc/transformer/inference/includes/context_hip.h rename to csrc/transformer/inference/includes/inference_context_hip.h index 738e2dcd61e7ef8c11afcfdf7d4385299307469a..5bb968d2e2f92df7797c72046a79ede642fd4d45 100644 --- a/deepspeed/ops/csrc/transformer/inference/includes/context_hip.h +++ b/csrc/transformer/inference/includes/inference_context_hip.h @@ -1,15 +1,22 @@ // !!! This is a file automatically generated by hipify!!! 
+/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + #pragma once -#include +#include #include #include #include #include #include "rocblas.h" #include "hip/hip_runtime.h" -#include "hiprand/hiprand.h" +#define MEGABYTE (1024 * 1024) +#define GIGABYTE (1024 * 1024 * 1024) + +// TODO: refactor out #define WARP_SIZE 32 #define CUDA_CHECK(callstr) \ @@ -41,10 +48,15 @@ inline int DS_GET_BLOCKS(const int N) class Context { public: - Context() : _workspace(nullptr), _seed(42), _curr_offset(0), _stream(0) + Context() + : _workspace(nullptr), + _seed(42), + _curr_offset(0), + _stream(0), + _free_memory_size(0), + _num_tokens(1), + _attention_unfused_workspace_offset(0) { - hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT); - hiprandSetPseudoRandomGeneratorSeed(_gen, 123); if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) { auto message = std::string("Fail to create cublas handle."); std::cerr << message << std::endl; @@ -52,16 +64,11 @@ public: } #ifndef __HIP_PLATFORM_HCC__ rocblas_set_math_mode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); - hipEventCreate(&_comp1_event, (hipEventDisableTiming | hipEventBlockingSync)); - hipEventCreate(&_comp2_event, (hipEventDisableTiming | hipEventBlockingSync)); - hipEventCreate(&_comp_event, (hipEventDisableTiming | hipEventBlockingSync)); - hipEventCreate(&_comm_event, (hipEventDisableTiming | hipEventBlockingSync)); -#else +#endif hipEventCreate(&_comp1_event); hipEventCreate(&_comp2_event); hipEventCreate(&_comp_event); hipEventCreate(&_comm_event); -#endif } virtual ~Context() @@ -80,23 +87,88 @@ public: return _ctx; } - void GenWorkSpace(size_t size) + void GenWorkSpace(const unsigned& num_layers, + const unsigned& num_heads, + const size_t& batch_size, + const size_t& prompt_len, + const size_t& hidden_dim, + const unsigned& mp_size, + const bool& external_cache, + const size_t& elem_size, + const unsigned& rank, + unsigned max_out_tokens) { + size_t total_size; + if (!_free_memory_size) { 
hipMemGetInfo(&_free_memory_size, &total_size); } + + // Flash attention requires padded heads and we'll conservatively allocate + // for that here. Flash attention is only enabled for head size <= 128 right now + const int head_size = hidden_dim / num_heads; + const int padded_head_size = head_size <= 32 ? 32 : (head_size <= 64 ? 64 : 128); + const int effective_head_size = (head_size > 128) ? head_size : padded_head_size; + + size_t activation_size = 16 * (num_heads * effective_head_size) * batch_size; + // Other sequence length dimension is added when the final workSpaceSize is calculated + size_t temp_size = batch_size * num_heads * max_out_tokens * 2; + size_t cache_size = + num_layers * batch_size * ((num_heads * effective_head_size) / mp_size) * 2; + size_t minimal_requirements = + temp_size + (_free_memory_size > GIGABYTE ? 500 : 100) * MEGABYTE; + if (_free_memory_size < minimal_requirements) { + printf("Requested:\t%lu\nFree:\t%lu\nTotal:\t%lu\n", + minimal_requirements, + _free_memory_size, + total_size); + throw std::runtime_error("Workspace can't be allocated, no enough memory."); + } + + _max_seq_len = ((_free_memory_size - minimal_requirements) / elem_size) / + (activation_size + temp_size + cache_size); + _max_seq_len = std::min((size_t)max_out_tokens, _max_seq_len); + size_t workSpaceSize = ((external_cache ? 
(activation_size + temp_size) + : (activation_size + temp_size + cache_size))) * + _max_seq_len * elem_size; + temp_size *= _max_seq_len * elem_size; + if (rank == 0 && !_workspace) + printf( + "------------------------------------------------------\n" + "Free memory : %f (GigaBytes) \n" + "Total memory: %f (GigaBytes) \n" + "Requested memory: %f (GigaBytes) \n" + "Setting maximum total tokens (input + output) to %lu \n" + "------------------------------------------------------\n", + (float)_free_memory_size / GIGABYTE, + (float)total_size / GIGABYTE, + (float)workSpaceSize / GIGABYTE, + _max_seq_len); if (!_workspace) { assert(_workspace == nullptr); - hipMalloc(&_workspace, size); - } else if (_workSpaceSize < size) { + hipMalloc(&_workspace, workSpaceSize); + } else if (_workSpaceSize < workSpaceSize) { hipFree(_workspace); - hipMalloc(&_workspace, size); + hipMalloc(&_workspace, workSpaceSize); } - _workSpaceSize = size; + if (!_workspace) { + printf("Requested:\t%lu\nFree:\t%lu\nTotal:\t%lu\n", + workSpaceSize, + _free_memory_size, + total_size); + throw std::runtime_error("Workspace is null."); + } + _workSpaceSize = workSpaceSize; + _attention_unfused_workspace_offset = workSpaceSize - temp_size; } + inline size_t GetMaxTokenLenght() const { return _max_seq_len; } hipEvent_t GetCompEvent(int id) { return id == 1 ? 
_comp1_event : _comp2_event; } size_t get_workspace_size() const { return _workSpaceSize; } void* GetWorkSpace() { return _workspace; } + void* GetAttentionUnfusedWorkspace() + { + return (char*)_workspace + _attention_unfused_workspace_offset; + } inline unsigned new_token(unsigned layer_id) { @@ -104,7 +176,7 @@ public: return _token_length; } - inline void reset_tokens(unsigned initial_tokens = 0) + inline void reset_tokens(unsigned initial_tokens = 1) { _num_tokens = initial_tokens; } //_token_length = 0; } @@ -113,8 +185,6 @@ public: inline void advance_tokens() { _num_tokens++; } - hiprandGenerator_t& GetRandGenerator() { return _gen; } - hipStream_t GetCommStream(bool async_op = false) { if (!_comm_stream) @@ -158,16 +228,21 @@ public: } private: - hiprandGenerator_t _gen; rocblas_handle _cublasHandle; hipEvent_t _comp_event; hipEvent_t _comm_event; void* _workspace; + // offset from _workspace for attention unfused memory + size_t _attention_unfused_workspace_offset; uint64_t _seed; uint64_t _curr_offset; + size_t _workSpaceSize; + size_t _free_memory_size; + + size_t _max_seq_len; hipEvent_t _comp1_event; hipEvent_t _comp2_event; diff --git a/deepspeed/ops/csrc/transformer/inference/includes/cublas_wrappers.h b/csrc/transformer/inference/includes/inference_cublas_wrappers.h similarity index 99% rename from deepspeed/ops/csrc/transformer/inference/includes/cublas_wrappers.h rename to csrc/transformer/inference/includes/inference_cublas_wrappers.h index 75d18a40fc8e468c3ddcc5b1ae8bbdfc421c7072..9e55cc1c7423ade02b681ec900e107aeb7a50a8c 100644 --- a/deepspeed/ops/csrc/transformer/inference/includes/cublas_wrappers.h +++ b/csrc/transformer/inference/includes/inference_cublas_wrappers.h @@ -1,3 +1,7 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/deepspeed/ops/csrc/transformer/inference/includes/cublas_wrappers_hip.h b/csrc/transformer/inference/includes/inference_cublas_wrappers_hip.h similarity index 99% rename from 
deepspeed/ops/csrc/transformer/inference/includes/cublas_wrappers_hip.h rename to csrc/transformer/inference/includes/inference_cublas_wrappers_hip.h index e7c81906bd790e200e6401f66c99dbcac2a0cbc5..14933988f644eebbe87943bdbfe1d10b4c3c67e9 100644 --- a/deepspeed/ops/csrc/transformer/inference/includes/cublas_wrappers_hip.h +++ b/csrc/transformer/inference/includes/inference_cublas_wrappers_hip.h @@ -1,4 +1,8 @@ // !!! This is a file automatically generated by hipify!!! +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + #pragma once #include diff --git a/csrc/transformer/inference/includes/inference_cuda_layers.h b/csrc/transformer/inference/includes/inference_cuda_layers.h new file mode 100644 index 0000000000000000000000000000000000000000..211f56bc8a76b68e2f58a005a2523a7d2d1dfa81 --- /dev/null +++ b/csrc/transformer/inference/includes/inference_cuda_layers.h @@ -0,0 +1,222 @@ +/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#pragma once + +#include "ds_kernel_utils.h" + +#include +#include +#include +#include +#include +#include + +#define MAX_WARP_NUM 32 +#define WARP_SIZE 32 + +#define MAX_THREADS 1024 +#define SMs 80 + +#define MAX_REGISTERS 256 + +template +void launch_attn_softmax_v2(T* vals, + T* mask, + T* alibi, + float layer_scale, + bool triangular, + bool recompute, + bool local_attention, + int window_size, + int batch_size, + int heads, + int num_seq, + int sequence_length, + int offset, + int mask_stride, + int mp_size, + cudaStream_t stream); + +// Fused bias add with gelu activation +template +void launch_bias_gelu(T* input, + const T* bias, + int intermediate_size, + int batch_size, + cudaStream_t stream); + +template +void launch_fused_bias_geglu(T* output, + const T* activation, + const T* bias, + int rows, + int elems_per_row, + cudaStream_t stream); + +// Fused bias add with relu activation +template +void launch_bias_relu(T* input, + const T* bias, + int intermediate_size, + int batch_size, + cudaStream_t stream); + +template +void 
launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream); + +template +void launch_bias_residual(T* input, + T* output, + T* attn, + T* bias, + T* attn_bias, + int batch, + int hidden_dim, + int mp_size, + bool preln, + cudaStream_t stream); + +template +void launch_fused_ln(T* output, + const T* vals, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + cudaStream_t stream); + +template +void launch_fused_residual_ln(T* output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + cudaStream_t stream); + +template +void launch_fused_residual_ln_store_pre_ln_res(T* norm_output, + T* res_output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + cudaStream_t stream); + +template +void launch_dequantize(T* output, + const int8_t* input, + const float* qscale, + unsigned output_size, + unsigned hidden_dim, + unsigned groups, + unsigned merge_count, + cudaStream_t stream); + +template +void launch_dequantize(T* output, + const int8_t* input, + const float* qscale, + unsigned output_size, + unsigned hidden_dim, + unsigned groups, + cudaStream_t stream); +template +void launch_gptj_residual_add(T* input, + T* output, + T* attn, + T* bias, + T* attn_bias, + int batch, + int head_size, + int mp_size, + cudaStream_t stream); + +template +void launch_apply_rotary_pos_emb(T* mixed_query, + T* key_layer, + unsigned head_size, + unsigned seq_len, + unsigned rotary_dim, + unsigned offset, + unsigned num_heads, + unsigned batch, + bool rotate_half, + bool rotate_every_two, + cudaStream_t stream, + int max_out_tokens); + +template +void launch_moe_res_matmul(T* residual, + T* coef, + T* mlp_out, + int seq_len, + int hidden_dim, + cudaStream_t stream); + +// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3] +template +void 
launch_transform4d_0213(T* out, + const T* in, + int batch_size, + int heads, + int seq_length, + int hidden_dim, + cudaStream_t stream, + int trans_count); +template +void launch_bias_add_transform_0213(T* outputs, + T* vals, + T* vals1, + const T* vals2, + const T* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int seq_length1, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + cudaStream_t stream, + int trans_count, + int max_out_tokens); +template +void pad_data(T* padded_output, + T* output, + int bsz, + int head_size, + int padded_head_size, + cudaStream_t stream); + +template +void pad_head_seq(T* padded_output, + T* output, + int bsz, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size, + cudaStream_t stream); + +template +void launch_pad_add_transform_0213(T* output, + const T* vals, + int batch_size, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size, + cudaStream_t stream); diff --git a/csrc/transformer/inference/includes/inference_hip_layers.h b/csrc/transformer/inference/includes/inference_hip_layers.h new file mode 100644 index 0000000000000000000000000000000000000000..1059266f6c36b51f280d6820e73f6099811366e4 --- /dev/null +++ b/csrc/transformer/inference/includes/inference_hip_layers.h @@ -0,0 +1,223 @@ +// !!! This is a file automatically generated by hipify!!! 
+/* +Copyright 2022 The Microsoft DeepSpeed Team +*/ + +#pragma once + +#include "ds_kernel_utils_hip.h" + +#include +#include +#include +#include +#include +#include + +#define MAX_WARP_NUM 32 +#define WARP_SIZE 32 + +#define MAX_THREADS 1024 +#define SMs 80 + +#define MAX_REGISTERS 256 + +template +void launch_attn_softmax_v2(T* vals, + T* mask, + T* alibi, + float layer_scale, + bool triangular, + bool recompute, + bool local_attention, + int window_size, + int batch_size, + int heads, + int num_seq, + int sequence_length, + int offset, + int mask_stride, + int mp_size, + hipStream_t stream); + +// Fused bias add with gelu activation +template +void launch_bias_gelu(T* input, + const T* bias, + int intermediate_size, + int batch_size, + hipStream_t stream); + +template +void launch_fused_bias_geglu(T* output, + const T* activation, + const T* bias, + int rows, + int elems_per_row, + hipStream_t stream); + +// Fused bias add with relu activation +template +void launch_bias_relu(T* input, + const T* bias, + int intermediate_size, + int batch_size, + hipStream_t stream); + +template +void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream); + +template +void launch_bias_residual(T* input, + T* output, + T* attn, + T* bias, + T* attn_bias, + int batch, + int hidden_dim, + int mp_size, + bool preln, + hipStream_t stream); + +template +void launch_fused_ln(T* output, + const T* vals, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + hipStream_t stream); + +template +void launch_fused_residual_ln(T* output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int elems_per_row, + hipStream_t stream); + +template +void launch_fused_residual_ln_store_pre_ln_res(T* norm_output, + T* res_output, + const T* vals, + const T* residual, + const T* bias, + const T* gamma, + const T* beta, + float epsilon, + int rows, + int 
elems_per_row, + hipStream_t stream); + +template +void launch_dequantize(T* output, + const int8_t* input, + const float* qscale, + unsigned output_size, + unsigned hidden_dim, + unsigned groups, + unsigned merge_count, + hipStream_t stream); + +template +void launch_dequantize(T* output, + const int8_t* input, + const float* qscale, + unsigned output_size, + unsigned hidden_dim, + unsigned groups, + hipStream_t stream); +template +void launch_gptj_residual_add(T* input, + T* output, + T* attn, + T* bias, + T* attn_bias, + int batch, + int head_size, + int mp_size, + hipStream_t stream); + +template +void launch_apply_rotary_pos_emb(T* mixed_query, + T* key_layer, + unsigned head_size, + unsigned seq_len, + unsigned rotary_dim, + unsigned offset, + unsigned num_heads, + unsigned batch, + bool rotate_half, + bool rotate_every_two, + hipStream_t stream, + int max_out_tokens); + +template +void launch_moe_res_matmul(T* residual, + T* coef, + T* mlp_out, + int seq_len, + int hidden_dim, + hipStream_t stream); + +// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3] +template +void launch_transform4d_0213(T* out, + const T* in, + int batch_size, + int heads, + int seq_length, + int hidden_dim, + hipStream_t stream, + int trans_count); +template +void launch_bias_add_transform_0213(T* outputs, + T* vals, + T* vals1, + const T* vals2, + const T* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int seq_length1, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + hipStream_t stream, + int trans_count, + int max_out_tokens); +template +void pad_data(T* padded_output, + T* output, + int bsz, + int head_size, + int padded_head_size, + hipStream_t stream); + +template +void pad_head_seq(T* padded_output, + T* output, + int bsz, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size, + hipStream_t stream); + +template +void launch_pad_add_transform_0213(T* output, + const T* vals, + int batch_size, + int 
hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size, + hipStream_t stream); diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu index d634c7f1b2cd1c2632495d8e1f3b47b45867c353..84f27b2c43af9eeb44b2fa353411d62f74e6830b 100644 --- a/csrc/transformer/normalize_kernels.cu +++ b/csrc/transformer/normalize_kernels.cu @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #include "custom_cuda_layers.h" namespace cg = cooperative_groups; @@ -862,6 +866,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, bool invertible, int row_stride) { +#ifdef HALF_PRECISION_AVAILABLE int iteration_stride = blockDim.x; int iterations = row_stride / iteration_stride; @@ -985,6 +990,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, inp_grad_h[high_index] = temp; } +#endif } template <> @@ -1172,6 +1178,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, __half* inp_grad, int row_stride) { +#ifdef HALF_PRECISION_AVAILABLE int iteration_stride = blockDim.x; int iterations = row_stride / iteration_stride; @@ -1290,6 +1297,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, __half2 temp = __float22half2_rn(vals_arr_f[iterations]); inp_grad_h[high_index] = temp; } +#endif } template <> @@ -1601,6 +1609,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, bool invertible, int row_stride) { +#ifdef HALF_PRECISION_AVAILABLE int iteration_stride = blockDim.x; int iterations = row_stride / iteration_stride; @@ -1727,6 +1736,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, inp_grad_h[high_index] = temp + out_grad_h2[high_index]; } +#endif } template <> @@ -1922,6 +1932,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, __half* inp_grad, int row_stride) { +#ifdef HALF_PRECISION_AVAILABLE int iteration_stride = blockDim.x; int iterations = row_stride / iteration_stride; @@ -2044,6 +2055,7 @@ 
__global__ void LayerNormBackward2_fused_add(const __half* out_grad1, __half2 temp = __float22half2_rn(vals_arr_f[iterations]); inp_grad_h[high_index] = temp + out_grad_h2[high_index]; } +#endif } template <> diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu index 34487cba9bbec060baa0fb79922e691076296476..bad814b8629d236753e134731e461cfaca1d88ff 100644 --- a/csrc/transformer/softmax_kernels.cu +++ b/csrc/transformer/softmax_kernels.cu @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #include #include "custom_cuda_layers.h" #include "general_kernels.h" @@ -536,6 +540,102 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/, } } +__global__ void softmax_backward_kernel_arbitrary_length(__half* grad /* input & output*/, + const __half* output, + int softmax_length) +{ + int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; + int offset = batch_idx * softmax_length + threadIdx.x; + + const float4* output_cast = reinterpret_cast(output); + float4* grad_cast = reinterpret_cast(grad); + + grad_cast += offset; + output_cast += offset; + + float sum = 0.0; + int curr_idx = threadIdx.x; + while (curr_idx < softmax_length) { + float4 out_reg = output_cast[curr_idx]; + float4 grad_reg = grad_cast[curr_idx]; + __half2* out_h = reinterpret_cast<__half2*>(&out_reg); + __half2* grad_h = reinterpret_cast<__half2*>(&grad_reg); +#pragma unroll + for (int m = 0; m < 4; m++) grad_h[m] *= out_h[m]; + sum += ((float)grad_h[0].x + (float)grad_h[0].y + (float)grad_h[1].x + (float)grad_h[1].y) + + ((float)grad_h[2].x + (float)grad_h[2].y + (float)grad_h[3].x + (float)grad_h[3].y); + curr_idx += WARP_SIZE; + } + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + +#pragma unroll + for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i); + + curr_idx = threadIdx.x; + while (curr_idx < softmax_length) { + float4 out_reg = output_cast[curr_idx]; + float4 
grad_reg = grad_cast[curr_idx]; + __half* grad_h = reinterpret_cast<__half*>(&grad_reg); + __half* out_h = reinterpret_cast<__half*>(&out_reg); + +#pragma unroll + for (int m = 0; m < 8; m++) grad_h[m] = (float)out_h[m] * ((float)grad_h[m] - sum); + + grad_cast[curr_idx] = grad_reg; + curr_idx += WARP_SIZE; + } +} + +__global__ void softmax_backward_kernel_arbitrary_length(float* grad /* input & output*/, + const float* output, + int softmax_length) +{ + int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; + int offset = batch_idx * softmax_length + threadIdx.x; + + const float4* output_cast = reinterpret_cast(output); + float4* grad_cast = reinterpret_cast(grad); + + grad_cast += offset; + output_cast += offset; + + float sum = 0.0; + int curr_idx = threadIdx.x; + while (curr_idx < softmax_length) { + float4 out_reg = output_cast[curr_idx]; + float4 grad_reg = grad_cast[curr_idx]; + + grad_reg.x *= out_reg.x; + grad_reg.y *= out_reg.y; + grad_reg.z *= out_reg.z; + grad_reg.w *= out_reg.w; + sum += (grad_reg.x + grad_reg.y + grad_reg.z + grad_reg.w); + + curr_idx += WARP_SIZE; + } + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + +#pragma unroll + for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i); + + curr_idx = threadIdx.x; + while (curr_idx < softmax_length) { + float4 out_reg = output_cast[curr_idx]; + float4 grad_reg = grad_cast[curr_idx]; + grad_reg.x = out_reg.x * (grad_reg.x - sum); + grad_reg.y = out_reg.y * (grad_reg.y - sum); + grad_reg.z = out_reg.z * (grad_reg.z - sum); + grad_reg.w = out_reg.w * (grad_reg.w - sum); + + grad_cast[curr_idx] = grad_reg; + curr_idx += WARP_SIZE; + } +} + template void launch_attn_softmax_backward_v2(T* out_grad, const T* soft_inp, @@ -575,10 +675,15 @@ void launch_attn_softmax_backward_v2(T* out_grad, else if (seq_length <= 2048) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 4096) + softmax_backward_kernel_v2 
+ <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 8192) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); else - throw std::runtime_error( - std::string("Special sequence length found in softmax backward, seq_length: ") + - std::to_string(seq_length)); + softmax_backward_kernel_arbitrary_length<<>>( + out_grad, soft_inp, seq_length / (4 << ((sizeof(T) & 2) >> 1))); } template void launch_attn_softmax_backward_v2<__half>(__half* out_grad, diff --git a/csrc/transformer/transform_kernels.cu b/csrc/transformer/transform_kernels.cu old mode 100644 new mode 100755 index 15a2219333e43a6da1b93038a406b35d302bb9d9..37a6e79bb54556ffb3b5bb81c881e6ffe44f1438 --- a/csrc/transformer/transform_kernels.cu +++ b/csrc/transformer/transform_kernels.cu @@ -1,3 +1,7 @@ +/* +Copyright The Microsoft DeepSpeed Team +*/ + #include "custom_cuda_layers.h" #define rows_trans 16 diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py old mode 100644 new mode 100755 index 2292b4195c070ef1d192541a6a60157238dfba0d..9e2e25513a723a0133ac45e1468d55e567a33872 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -4,6 +4,7 @@ Copyright 2020 The Microsoft DeepSpeed Team import sys import types +import json from typing import Optional, Union import torch from torch.optim import Optimizer @@ -17,17 +18,18 @@ from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpe from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER from .runtime.pipe.engine import PipelineEngine from .inference.engine import InferenceEngine - +from .inference.config import DeepSpeedInferenceConfig from .runtime.lr_schedules import add_tuning_arguments from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError from .runtime.activation_checkpointing import checkpointing from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .module_inject import replace_transformer_layer, revert_transformer_layer -from .utils import log_dist 
-from .utils.distributed import init_distributed +from .utils import log_dist, OnDevice +from .comm.comm import init_distributed from .runtime import zero +from .runtime import DeepSpeedOptimizer, ZeROOptimizer from .pipe import PipelineModule @@ -82,7 +84,7 @@ def initialize(args=None, mpu: Optional: A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,world_size}() - dist_init_required: Optional: None will auto-initialize torch.distributed if needed, + dist_init_required: Optional: None will auto-initialize torch distributed if needed, otherwise the user can force it to be initialized or not via boolean. collate_fn: Optional: Merges a list of samples to form a @@ -113,6 +115,10 @@ def initialize(args=None, __git_hash__, __git_branch__), ranks=[0]) + + # Disable zero.Init context if it's currently enabled + zero.partition_parameters.shutdown_init_context() + assert model is not None, "deepspeed.initialize requires a model" if not isinstance(model, PipelineModule): @@ -217,61 +223,57 @@ def add_config_arguments(parser): return parser -def init_inference(model, - triangular_masking=True, - mp_size=1, - training_mp_size=1, - mpu=None, - ep_group=None, - expert_mp_group=None, - checkpoint=None, - dtype=None, - injection_policy=None, - replace_method='auto', - quantization_setting=None, - replace_with_kernel_inject=False, - return_tuple=True, - ep_size=1, - moe=False, - moe_experts=1, - moe_type='standard', - args=None): +def default_inference_config(): + """ + Return a default DeepSpeed inference configuration dictionary. + """ + return DeepSpeedInferenceConfig().dict() + + +def init_inference(model, config=None, **kwargs): """Initialize the DeepSpeed InferenceEngine. - Arguments: - model: Required: nn.module class before apply any wrappers + Description: all four cases are valid and supported in DS init_inference() API. 
- triangular_masking: Required: this shows the type of masking for attention scores in transformer layer - note that the masking is application specific. + # Case 1: user provides no config and no kwargs. Default config will be used. - mp_size: Optional: Desired model parallel size, default is 1 meaning no - model parallelism. + .. code-block:: python - training_mp_size: Optional: if loading a checkpoint this is the mp size that it was trained with, - it may be different than what the mp size that you want to use during inference. + generator.model = deepspeed.init_inference(generator.model) + string = generator("DeepSpeed is") + print(string) - mpu: Optional: A model parallelism unit object that implements - get_{model,data}_parallel_{rank,group,world_size}() + # Case 2: user provides a config and no kwargs. User supplied config will be used. + + .. code-block:: python + + generator.model = deepspeed.init_inference(generator.model, config=config) + string = generator("DeepSpeed is") + print(string) + + # Case 3: user provides no config and uses keyword arguments (kwargs) only. - checkpoint: Optional: Path to deepspeed compatible checkpoint or path to - JSON with load policy. + .. code-block:: python - dtype: Optional: Desired model data type, will convert model to this type. - Supported target types: torch.half, torch.int8, torch.float + generator.model = deepspeed.init_inference(generator.model, + mp_size=world_size, + dtype=torch.half, + replace_with_kernel_inject=True) + string = generator("DeepSpeed is") + print(string) - injection_policy: Optional: Dictionary mapping a client nn.Module to its corresponding - injection policy. e.g., {BertLayer : deepspeed.inference.HFBertLayerPolicy} + # Case 4: user provides config and keyword arguments (kwargs). Both config and kwargs are merged and kwargs take precedence. - replace_method: Optional: If 'auto' DeepSpeed will automatically try and replace - model modules with its optimized versions. 
If an injection_policy is set this will - override the automatic replacement behavior. + .. code-block:: python - quantization_setting: Optional: Quantization settings used for quantizing your model using the MoQ. - The setting can be one element or a tuple. If one value is passed in, we consider it as the number - of groups used in quantization. A tuple is passed in if we want to mention that there is extra-grouping - for the MLP part of a Transformer layer (e.g. (True, 8) shows we quantize the model using 8 groups for - all the network except the MLP part that we use 8 extra grouping). - replace_with_kernel_inject: If set we inject kernel as we initialize the inference-engine + generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True) + string = generator("DeepSpeed is") + print(string) + + Arguments: + model: Required: original nn.module object without any wrappers + + config: Optional: instead of arguments, you can pass in a DS inference config dict or path to JSON file Returns: A deepspeed.InferenceEngine wrapped model. 
@@ -282,24 +284,30 @@ def init_inference(model, __git_branch__), ranks=[0]) - engine = InferenceEngine(model, - triangular_masking, - mp_size, - training_mp_size, - ep_size, - mpu, - ep_group, - expert_mp_group, - checkpoint, - dtype, - injection_policy, - return_tuple, - replace_method, - quantization_setting, - replace_with_kernel_inject, - moe, - moe_experts, - moe_type, - args) + # Load config_dict from config first + if config is None: + config = {} + if isinstance(config, str): + with open(config, "r") as f: + config_dict = json.load(f) + elif isinstance(config, dict): + config_dict = config + else: + raise ValueError( + f"'config' argument expected string or dictionary, got {type(config)}") + + # Update with values from kwargs, ensuring no conflicting overlap between config and kwargs + overlap_keys = set(config_dict.keys()).intersection(kwargs.keys()) + # If there is overlap, error out if values are different + for key in overlap_keys: + if config_dict[key] != kwargs[key]: + raise ValueError( + f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}" + ) + config_dict.update(kwargs) + + ds_inference_config = DeepSpeedInferenceConfig(**config_dict) + + engine = InferenceEngine(model, config=ds_inference_config) return engine diff --git a/deepspeed/accelerator b/deepspeed/accelerator new file mode 120000 index 0000000000000000000000000000000000000000..b61ffface2d00db74498ac502e3eb559afc23301 --- /dev/null +++ b/deepspeed/accelerator @@ -0,0 +1 @@ +../accelerator/ \ No newline at end of file diff --git a/deepspeed/autotuning/README.md b/deepspeed/autotuning/README.md old mode 100644 new mode 100755 diff --git a/deepspeed/autotuning/__init__.py b/deepspeed/autotuning/__init__.py index deea832eb6daf1c1b838f8d01a89a3209e4220c9..11ad4700017aea6ee82f65d5e70d490b5e60f81f 100644 --- a/deepspeed/autotuning/__init__.py +++ b/deepspeed/autotuning/__init__.py @@ -1 +1,3 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .autotuner 
import Autotuner diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py old mode 100644 new mode 100755 index 4ff85e6d9717767ed24d3387edec8ff1d37f961c..569b62666fc0986a78354e5a50184a7a71dc2ed6 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -1,30 +1,40 @@ -import copy -import json -import os -from random import sample +'''Copyright The Microsoft DeepSpeed Team''' + import shutil import subprocess -import hjson -import torch import time import datetime import math +import hjson from ..runtime.config_utils import dict_raise_error_on_duplicate_keys from ..runtime.constants import * -from ..runtime.zero.constants import * + +from ..runtime.zero.config import ZERO_OPTIMIZATION, ZeroStageEnum from ..utils import logger from .config import DeepSpeedAutotuningConfig from .constants import * -from .scheduler import ResourceManager, run_experiment +from .scheduler import ResourceManager from .tuner import GridSearchTuner, RandomTuner, ModelBasedTuner from .utils import * +from deepspeed.accelerator import get_accelerator try: from tabulate import tabulate except ImportError: tabulate = None +try: + import mlflow + has_mlflow = True +except Exception as e: + has_mlflow = False + +ZERO_OPTIMIZATION_STAGE = "stage" +OFFLOAD_OPTIMIZER = "offload_optimizer" +OFFLOAD_PARAM = "offload_param" +ZERO_OPTIMIZATION_STAGE_DEFAULT = ZeroStageEnum.disabled + class Autotuner: """The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods. 
@@ -42,22 +52,37 @@ class Autotuner: assert self.user_config is not None, "DeepSpeed configuration is not provided" self.autotuning_config = DeepSpeedAutotuningConfig(self.user_config) + if self.user_config[AUTOTUNING]: + if AUTOTUNING_EXPS_DIR in self.user_config[AUTOTUNING].keys(): + del self.user_config[AUTOTUNING][AUTOTUNING_EXPS_DIR] + if AUTOTUNING_RESULTS_DIR in self.user_config[AUTOTUNING].keys(): + del self.user_config[AUTOTUNING][AUTOTUNING_RESULTS_DIR] - self.exps_dir = DEFAULT_EXPRS_DIR - if self.autotuning_config.exps_dir and self.autotuning_config.exps_dir != "": - self.exps_dir = self.autotuning_config.exps_dir + self.exps_dir = self.autotuning_config.exps_dir if self.autotuning_config.overwrite and os.path.exists(self.exps_dir): shutil.rmtree(self.exps_dir, ignore_errors=True) if not os.path.exists(self.exps_dir): - os.makedirs(self.exps_dir, exist_ok=True) + try: + os.makedirs(self.exps_dir, exist_ok=True) + logger.info(f"Created autotuning experiments directory: {self.exps_dir}") + except: + logger.error( + f"Failed to create {self.exps_dir}, please check `exps_dir` in the autotuning config file is accessible by all the nodes in the job." + ) + exit(-1) - self.results_dir = DEFAULT_RESULTS_DIR - if self.autotuning_config.results_dir and self.autotuning_config.results_dir != "": - self.results_dir = self.autotuning_config.results_dir + self.results_dir = self.autotuning_config.results_dir if self.autotuning_config.overwrite and os.path.exists(self.results_dir): shutil.rmtree(self.results_dir, ignore_errors=True) if not os.path.exists(self.results_dir): - os.makedirs(self.results_dir, exist_ok=True) + try: + os.makedirs(self.results_dir, exist_ok=True) + logger.info(f"Created autotuning resutls directory: {self.exps_dir}") + except: + logger.error( + f"Failed to create {self.results_dir}, please check `results_dir` in the autotuning config file is accessible by all the nodes in the job." 
+ ) + exit(-1) # set the active resource for the autotuner resource manager self.rm = self._get_resource_manager(active_resources) @@ -70,6 +95,10 @@ class Autotuner: self.rm.nodes), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any" self.records = {} + self.optimal_cmd = None + self.optmal_ds_config = None + + self.mlflow_parent_id = None def print_tuning_results(self): """Print the autotuning results in tabular format. @@ -252,7 +281,7 @@ class Autotuner: return False def get_gpu_memory_info(self): - return torch.cuda.get_device_properties(0).total_memory + return get_accelerator().total_memory() def get_activation_memory_per_gpu(self): if self.model_info and "activation_mem_per_gpu" in self.model_info: @@ -266,18 +295,18 @@ class Autotuner: if not num_params: return 0 # assume the model uses Adam optimizer - # ZERO_OPTIMIZATION_DISABLED: + # ZeroStageEnum.disabled: params_mem = num_params * (2 if fp16_enabled else 4) gradients_mem = num_params * (2 if fp16_enabled else 4) optimizer_mem = num_params * (16 if fp16_enabled else 8) - if zero_stage >= ZERO_OPTIMIZATION_OPTIMIZER_STATES: + if zero_stage >= ZeroStageEnum.optimizer_states: optimizer_mem = optimizer_mem / total_gpus - if zero_stage >= ZERO_OPTIMIZATION_GRADIENTS: + if zero_stage >= ZeroStageEnum.gradients: gradients_mem = gradients_mem / total_gpus - if zero_stage >= ZERO_OPTIMIZATION_WEIGHTS: + if zero_stage >= ZeroStageEnum.weights: params_mem = params_mem / total_gpus mem_per_gpu = (params_mem + gradients_mem + optimizer_mem) / self.mp_size() @@ -308,7 +337,7 @@ class Autotuner: # each zero stage uses a different template configuration file config_zero = tuning_space.get(ZERO_OPTIMIZATION, {}) - stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None) + stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT) template_config = {} if stage == 0: template_path = DEFAULT_TEMPLATE_PATH_ZERO_0 @@ -331,12 +360,11 @@ class 
Autotuner: model_info = self.model_info if model_info and "hidden_size" in model_info: hs = model_info["hidden_size"] + template_config[ZERO_OPTIMIZATION]['reduce_bucket_size'] = hs * hs template_config[ZERO_OPTIMIZATION][ - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE] = hs * hs - template_config[ZERO_OPTIMIZATION][ - ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE] = 0.9 * hs * hs + 'stage3_prefetch_bucket_size'] = 0.9 * hs * hs template_config[ZERO_OPTIMIZATION][ - ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD] = 10 * hs + 'stage3_param_persistence_threshold'] = 10 * hs prefix = "z3_" else: return exps @@ -355,11 +383,11 @@ class Autotuner: logger.debug(f"tuning_keys = {tuning_keys}") - logger.debug(f"before prunning total configs = {len(all_configs)}") + logger.debug(f"before pruning total configs = {len(all_configs)}") pruned_list = prune_configs(all_configs) - logger.debug(f"after prunning total configs = {len(pruned_list)}") + logger.debug(f"after pruning total configs = {len(pruned_list)}") for config in pruned_list: exp_config = copy.deepcopy(template_config) @@ -375,7 +403,6 @@ class Autotuner: if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[ ZERO_OPTIMIZATION]: del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM] - # set gradient accumulation steps according to max_train_batch_size_per_gpu mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] gas = max_train_batch_size_per_gpu // mbs @@ -396,6 +423,10 @@ class Autotuner: def tune(self): """ Tunes Zero stages, micro batch size per GPU, and other Zero configurations. Performance metrics of different tuning spaces are recorded in self.records. """ + if has_mlflow: + self.mlflow_parent_id = os.environ['MLFLOW_RUN_ID'] + mlflow.start_run(run_id=self.mlflow_parent_id) + self.start_time = time.time() if self.fast_enabled(): logger.info(f"Fast mode is enabled. 
Tuning micro batch size only.") @@ -420,9 +451,11 @@ class Autotuner: f"The model requires at least {memory_to_string(self.activation_mem, postfix='B')} activation memory for micro batch size 1." ) + #TODO: FIX THIS stage = self.user_config.get(ZERO_OPTIMIZATION, {}).get(ZERO_OPTIMIZATION_STAGE, "all") + stage = "all" user_zero_stages = [stage] if not isinstance(stage, list) else stage logger.info(f"User-defined zero stages are {stage}.") @@ -431,9 +464,9 @@ class Autotuner: metric_val = 0 required_gpu_mem = self.get_instantiation_memory_required_per_gpu( - ZERO_OPTIMIZATION_DISABLED) + self.activation_mem + ZeroStageEnum.disabled) + self.activation_mem if self.gpu_mem > required_gpu_mem: - if "all" in user_zero_stages or ZERO_OPTIMIZATION_DISABLED in user_zero_stages: + if "all" in user_zero_stages or ZeroStageEnum.disabled in user_zero_stages: logger.info( f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space" ) @@ -443,15 +476,17 @@ class Autotuner: mbs = next_mbs max_mbs = next_max_mbs metric_val = next_metric_val + if has_mlflow: + mlflow.log_metric(f"z0{self.metric()}", next_metric_val) else: logger.info( - f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_DISABLED} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" + f"The model is not runable with ZERO stage {ZeroStageEnum.disabled} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" ) required_gpu_mem = self.get_instantiation_memory_required_per_gpu( - ZERO_OPTIMIZATION_OPTIMIZER_STATES) + self.activation_mem + ZeroStageEnum.optimizer_states) + self.activation_mem if self.gpu_mem > required_gpu_mem: - if "all" in user_zero_stages or ZERO_OPTIMIZATION_OPTIMIZER_STATES in user_zero_stages: + if "all" in user_zero_stages or ZeroStageEnum.optimizer_states in 
user_zero_stages: logger.info( f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space" ) @@ -461,15 +496,17 @@ class Autotuner: mbs = next_mbs max_mbs = next_max_mbs metric_val = next_metric_val + if has_mlflow: + mlflow.log_metric(f"z1{self.metric()}", next_metric_val) else: logger.info( - f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_OPTIMIZER_STATES} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" + f"The model is not runable with ZERO stage {ZeroStageEnum.optimizer_states} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" ) required_gpu_mem = self.get_instantiation_memory_required_per_gpu( - ZERO_OPTIMIZATION_GRADIENTS) + self.activation_mem + ZeroStageEnum.gradients) + self.activation_mem if self.gpu_mem > required_gpu_mem: - if "all" in user_zero_stages or ZERO_OPTIMIZATION_GRADIENTS in user_zero_stages: + if "all" in user_zero_stages or ZeroStageEnum.gradients in user_zero_stages: logger.info( f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space" ) @@ -479,25 +516,31 @@ class Autotuner: mbs = next_mbs max_mbs = next_max_mbs metric_val = next_metric_val + if has_mlflow: + mlflow.log_metric(f"z2{self.metric()}", next_metric_val) else: logger.info( - f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_GRADIENTS} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" + f"The model is not runable with ZERO stage {ZeroStageEnum.gradients} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" ) required_gpu_mem = self.get_instantiation_memory_required_per_gpu( - ZERO_OPTIMIZATION_WEIGHTS) + 
self.activation_mem + ZeroStageEnum.weights) + self.activation_mem if self.gpu_mem > required_gpu_mem: - if "all" in user_zero_stages or ZERO_OPTIMIZATION_WEIGHTS in user_zero_stages: + if "all" in user_zero_stages or ZeroStageEnum.weights in user_zero_stages: logger.info( f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space" ) - _, _, _ = self.tune_space( + _, _, next_metric_val = self.tune_space( DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val) + if has_mlflow: + mlflow.log_metric(f"z3{self.metric()}", next_metric_val) else: logger.info( - f"The model has {self.get_model_num_params()} parameters and requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory per GPU with DeepSpeed Zero stage {ZERO_OPTIMIZATION_WEIGHTS} optimization. Memory per GPU in system is {memory_to_string(self.gpu_mem)}. No tuning is performed." + f"The model has {self.get_model_num_params()} parameters and requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory per GPU with DeepSpeed Zero stage {ZeroStageEnum.weights} optimization. Memory per GPU in system is {memory_to_string(self.gpu_mem)}. No tuning is performed." 
) return + if has_mlflow: + mlflow.end_run() def tune_space(self, tuning_space, @@ -505,7 +548,7 @@ class Autotuner: prev_best_mbs=0, prev_best_metric_val=0): config_zero = tuning_space.get(ZERO_OPTIMIZATION, {}) - stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT) + stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None) tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage) tuning_micro_batch_sizes = [] max_train_batch_size_per_gpu = 0 @@ -785,11 +828,12 @@ class Autotuner: self.rm.schedule_experiments(exp_paths) self.rm.run() + for exp_id, (exp, err) in self.rm.finished_experiments.items(): if exp: metric_file = exp[DS_CONFIG][AUTOTUNING][AUTOTUNING_METRIC_PATH] - if os.path.exists(metric_file): + with open(metric_file, 'r') as f: results = hjson.load(f) metric_val = results[self.metric()] @@ -797,11 +841,19 @@ class Autotuner: if max_micro_batch_size == exp[DS_CONFIG][ TRAIN_MICRO_BATCH_SIZE_PER_GPU]: max_micro_batch_size_metric_val = metric_val + if has_mlflow: + os.environ.pop('MLFLOW_RUN_ID') + mlflow.start_run(nested=True, run_name=exp['name']) + for metric in results: + mlflow.log_metric(metric, results[metric]) + mlflow.end_run() + os.environ['MLFLOW_RUN_ID'] = self.mlflow_parent_id else: self.update_records(tuning_space_name, exp, 0, 1) else: mbs = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] logger.info(f"micro batch size = {mbs} was not run successfully") + self.rm.clear() if tuning_micro_batch_sizes_overwritten: @@ -831,7 +883,18 @@ class Autotuner: self.exp_num_gpus * self.exp_num_nodes // self.mp_size() exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs) exp, metric_val = self.run_ds_config(ds_config, exp_name) + if metric_val: + with open(metric_file, 'r') as f: + results = hjson.load(f) + metric_val = results[self.metric()] + if has_mlflow: + os.environ.pop('MLFLOW_RUN_ID') + mlflow.start_run(nested=True, run_name=exp_name) + for metric in results: + mlflow.log_metric(metric, 
results[metric]) + mlflow.end_run() + os.environ['MLFLOW_RUN_ID'] = self.mlflow_parent_id self.update_records(tuning_space_name, exp, metric_val, 1) if metric_val > prev_best_metric_val * (1 + METRIC_PERCENT_DIFF_CONST): prev_best_metric_val = metric_val @@ -843,7 +906,6 @@ class Autotuner: break if prev_best_mbs != max_micro_batch_size: tuning_micro_batch_sizes[-1] = prev_best_mbs - return tuning_micro_batch_sizes def get_min_max_micro_batch_size(self, @@ -961,11 +1023,10 @@ class Autotuner: low = min_micro_batch_size high = max_micro_batch_size - while low < high: + # binary search until low is the smallest micro batch size that OOMs. + while low <= high: mid = int((low + high) // 2) logger.debug(f"trying mbs = {mid}, low = {low}, high = {high}") - if mid == low: - break if mid not in used_micro_batch_sizes: ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mid ds_config[TRAIN_BATCH_SIZE] = mid * gas * \ @@ -973,7 +1034,7 @@ class Autotuner: exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mid) exp, metric_val = self.run_ds_config(ds_config, exp_name) if metric_val: - low = mid + low = mid + 1 self.update_records(tuning_space_name, exp, metric_val, 1) used_micro_batch_sizes.append(mid) if prev_metric_val and ((metric_val - prev_metric_val) / @@ -985,8 +1046,8 @@ class Autotuner: self.update_records(tuning_space_name, exp, 0, 1) high = mid - 1 else: - low = mid - max_micro_batch_size = low + low = mid + 1 + max_micro_batch_size = low - 1 logger.info( f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}." @@ -1084,26 +1145,18 @@ class Autotuner: json.dump(exp_config, fd) fd.flush() os.fsync(fd) - self.rm.schedule_experiments([exp_path]) self.rm.run() exp, metric_val = self.rm.parse_results(self.metric()) self.rm.clear() return exp, metric_val - def run_after_tuning(self): - """ Launches the training with the optmimal DeepSpeed configuration found through the autotuning process. 
- "ds_config_optimal.json" describing the optmimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir. - """ + def write_optimal_config(self): best_space_records = self.get_best_space_records() if GLOBAL_TUNING_SPACE not in best_space_records: return best_exp, best_metric_val, _ = best_space_records[GLOBAL_TUNING_SPACE] if best_exp: - logger.info( - "Start training with the optmimal DeepSpeed configuration found through the tuning process" - ) - exp_dir = best_exp["result_dir"] cmd = None with open(os.path.join(exp_dir, "cmd.txt"), "r") as f: @@ -1115,18 +1168,27 @@ class Autotuner: ds_config_path = os.path.join(self.results_dir, "ds_config_optimal.json") json.dump(ds_config, open(ds_config_path, "w")) - idx = cmd.index(os.path.join(exp_dir, "ds_config.json")) - cmd[idx] = ds_config_path - cmd_path = os.path.join(self.results_dir, "cmd_optimal.txt") with open(cmd_path, "w") as fd: fd.write(" ".join(cmd)) fd.write("\n") fd.flush() + self.optimal_cmd = cmd + self.optmal_ds_config = ds_config + logger.info( + f"Wrote the optimal DeepSpeed configuration found by autotuning to {ds_config_path}, and the corresponding DeepSpeed command to {cmd_path}" + ) - result = subprocess.Popen(cmd) + def run_after_tuning(self): + """ Launches the training with the optimal DeepSpeed configuration found through the autotuning process. + "ds_config_optimal.json" describing the optmimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir. 
+ """ + if self.optimal_cmd: + result = subprocess.Popen(self.optimal_cmd) result.wait() logger.info( - f"Done running with the optimal DeepSpeed configuration found by autotuning: {ds_config_path}" + f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}" ) + else: + logger.info(f"No optimal DeepSpeed configuration found by autotuning.") diff --git a/deepspeed/autotuning/config.py b/deepspeed/autotuning/config.py index dea36f0377dcb6ac0eba3fcf9211f95fbde6ba55..6f6b6903efc5a2f06030276979b680fd289e3f1c 100644 --- a/deepspeed/autotuning/config.py +++ b/deepspeed/autotuning/config.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ Copyright (c) Microsoft Corporation Licensed under the MIT license. @@ -41,11 +42,11 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigObject): self.results_dir = get_scalar_param(autotuning_dict, AUTOTUNING_RESULTS_DIR, AUTOTUNING_RESULTS_DIR_DEFAULT) - + assert self.results_dir, "results_dir cannot be empty" self.exps_dir = get_scalar_param(autotuning_dict, AUTOTUNING_EXPS_DIR, AUTOTUNING_EXPS_DIR_DEFAULT) - + assert self.exps_dir, "exps_dir cannot be empty" self.overwrite = get_scalar_param(autotuning_dict, AUTOTUNING_OVERWRITE, AUTOTUNING_OVERWRITE_DEFAULT) diff --git a/deepspeed/autotuning/constants.py b/deepspeed/autotuning/constants.py index 3bfcd2725f90b41813546411346264927bf28b95..d0306bb09bb91e45e28da008e8cbb9ba50532a3b 100644 --- a/deepspeed/autotuning/constants.py +++ b/deepspeed/autotuning/constants.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ Copyright (c) Microsoft Corporation Licensed under the MIT license. 
@@ -22,9 +23,6 @@ DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__f "config_templates", "template_zero3.json") -DEFAULT_EXPRS_DIR = os.path.join(os.getcwd(), "autotuning_exps") -DEFAULT_RESULTS_DIR = os.path.join(os.getcwd(), "autotuning_results") - METRIC_PERCENT_DIFF_CONST = 0.05 DS_CONFIG = "ds_config" BUFSIZE = 1 # line buffer size for writing files @@ -54,10 +52,10 @@ AUTOTUNING_FAST = "fast" AUTOTUNING_FAST_DEFAULT = True AUTOTUNING_RESULTS_DIR = "results_dir" -AUTOTUNING_RESULTS_DIR_DEFAULT = None +AUTOTUNING_RESULTS_DIR_DEFAULT = "autotuning_results" AUTOTUNING_EXPS_DIR = "exps_dir" -AUTOTUNING_EXPS_DIR_DEFAULT = None +AUTOTUNING_EXPS_DIR_DEFAULT = "autotuning_exps" AUTOTUNING_OVERWRITE = "overwrite" AUTOTUNING_OVERWRITE_DEFAULT = True diff --git a/deepspeed/autotuning/scheduler.py b/deepspeed/autotuning/scheduler.py old mode 100644 new mode 100755 index e4090b38e4c6464c70ebab8bd0504d8b911e1687..2a4c0c70d95552c54844d00e763eeff338953087 --- a/deepspeed/autotuning/scheduler.py +++ b/deepspeed/autotuning/scheduler.py @@ -1,23 +1,20 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import copy -from re import I from numpy import BUFSIZE -from deepspeed.env_report import SUCCESS -from enum import Flag import json -import os import subprocess import sys import threading import time -from pathlib import Path -from typing import List +import base64 +import os import hjson from tqdm import tqdm from ..utils import logger -from .constants import * from .constants import AUTOTUNING, AUTOTUNING_METRIC_PATH from .utils import get_val_by_key, search_error, was_interruptted """ @@ -25,9 +22,7 @@ thread-0: loop over experiment queue dispatching experiments if they become avai thread-N: start each experiment in its own thread """ -import torch.distributed as dist - -from datetime import datetime +from deepspeed import comm as dist TIMEOUT = 5 @@ -188,7 +183,6 @@ class ResourceManager: logger.debug(f'Put exp_id = {exp["exp_id"]} back into the 
queue') self.experiment_check(pbar) else: - desc = "" for reservation in reservations: reservation.slots.sort() @@ -344,19 +338,27 @@ def run_experiment(exp: dict, reservations, user_script, user_args): exp["job_id"] = get_job_id() exp_dir = exp["result_dir"] os.makedirs(exp_dir, exist_ok=True) - - exp["ds_config_path"] = os.path.join(exp_dir, "ds_config.json") + ds_config_path = os.path.join(exp_dir, "ds_config.json") + exp["ds_config_path"] = ds_config_path ds_config = copy.deepcopy(exp["ds_config"]) + ds_config_json = json.dumps(ds_config).encode('utf-8') + + exp["ds_config_base64"] = base64.urlsafe_b64encode(ds_config_json).decode('utf-8') with open(exp["ds_config_path"], "w", buffering=BUFSIZE) as fd: json.dump(ds_config, fd) fd.flush() os.fsync(fd) + path = exp["ds_config_path"] + logger.info(f"Scheduler wrote ds_config to {path}, {os.path.abspath(path)}") + with open(os.path.join(exp_dir, "exp.json"), "w", buffering=BUFSIZE) as fd: json.dump(exp, fd) fd.flush() os.fsync(fd) + path = os.path.join(exp_dir, "exp.json") + logger.info(f"Scheduler wrote exp to {path}, {os.path.abspath(path)}") # remove "--deepspeed_config ds_config.json" from user_args if user_args: @@ -365,9 +367,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args): # "--deepspeed_config" is omitted in HF elif "--deepspeed" in user_args: idx = user_args.index("--deepspeed") - assert idx < len(user_args) and ".json" in user_args[idx + - 1], "there is no ds_config file specified after --deepspeed_config or --deepspeed" - user_args[idx + 1] = exp["ds_config_path"] + assert idx < len(user_args), "there is no ds_config file specified after --deepspeed_config or --deepspeed" + # user_args[idx + 1] = exp["ds_config_path"] + # pass base64 serialized ds_config to launcher + user_args[idx + 1] = exp["ds_config_base64"] exp["user_script"] = user_script exp["user_args"] = user_args @@ -382,7 +385,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args): fd.flush() 
os.fsync(fd) - logger.info(f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}") + logger.info( + f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}, and ds_config = {os.path.abspath(ds_config_path)}" + ) with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open( os.path.join(exp_dir, "stderr.log"), "wb" @@ -396,7 +401,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args): clean_up(exp, reservations) - logger.info(f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}") + logger.info( + f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}" + ) PDSH_MAX_FAN_OUT = 1024 diff --git a/deepspeed/autotuning/tuner/__init__.py b/deepspeed/autotuning/tuner/__init__.py old mode 100644 new mode 100755 index 7ce9fe4f971282a888a7c86da8145c21565c5da1..9f2e5675d9fd8a124bf81014152ee6bc6fa386f4 --- a/deepspeed/autotuning/tuner/__init__.py +++ b/deepspeed/autotuning/tuner/__init__.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .index_based_tuner import RandomTuner, GridSearchTuner # from .ga_tuner import GATuner from .model_based_tuner import ModelBasedTuner diff --git a/deepspeed/autotuning/tuner/base_tuner.py b/deepspeed/autotuning/tuner/base_tuner.py old mode 100644 new mode 100755 index fbdb16dacb533396fe23c9789778779913cae88e..89eb2d384c94e3dc851bdb5a1e6ef08a3d21ac88 --- a/deepspeed/autotuning/tuner/base_tuner.py +++ b/deepspeed/autotuning/tuner/base_tuner.py @@ -1,12 +1,11 @@ -import atexit +'''Copyright The Microsoft DeepSpeed Team''' + import sys from deepspeed.autotuning.constants import * from deepspeed.autotuning.utils import write_experiments from deepspeed.utils import logger -import json - class BaseTuner: def __init__(self, exps, resource_manager, metric): diff --git a/deepspeed/autotuning/tuner/cost_model.py b/deepspeed/autotuning/tuner/cost_model.py old mode 100644 new mode 100755 index 
c311659426bf05a0ada0adc41915384d6599e0df..858ab6d3d1ea29cc0da63eb2671c69ee47dff909 --- a/deepspeed/autotuning/tuner/cost_model.py +++ b/deepspeed/autotuning/tuner/cost_model.py @@ -1,4 +1,4 @@ -import numpy as np +'''Copyright The Microsoft DeepSpeed Team''' from .utils import * diff --git a/deepspeed/autotuning/tuner/index_based_tuner.py b/deepspeed/autotuning/tuner/index_based_tuner.py old mode 100644 new mode 100755 index f7233f2e8d752d094729ad3663daedf3710a1f26..f19694871c538f985982c8fc141c3853285a1b26 --- a/deepspeed/autotuning/tuner/index_based_tuner.py +++ b/deepspeed/autotuning/tuner/index_based_tuner.py @@ -1,6 +1,6 @@ -import random +'''Copyright The Microsoft DeepSpeed Team''' -from deepspeed.utils import logger +import random from .base_tuner import BaseTuner diff --git a/deepspeed/autotuning/tuner/model_based_tuner.py b/deepspeed/autotuning/tuner/model_based_tuner.py old mode 100644 new mode 100755 index d8bc2b499f3d343a9510c38029b5f1254c92821f..ec475005abe007ffe5918a46edcca697e0f3da87 --- a/deepspeed/autotuning/tuner/model_based_tuner.py +++ b/deepspeed/autotuning/tuner/model_based_tuner.py @@ -1,8 +1,8 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import hjson -import numpy as np -from deepspeed.utils import logger -from ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH, AUTOTUNING_METRIC_DEFAULT +from ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH from .base_tuner import BaseTuner from .cost_model import XGBoostCostModel from .utils import * diff --git a/deepspeed/autotuning/tuner/utils.py b/deepspeed/autotuning/tuner/utils.py old mode 100644 new mode 100755 index f87c7563966ad8ecfb2012e4968a2c0f9d61dcfa..f6719f0b37d2d6e9a7164c7dad0d8e717c10da76 --- a/deepspeed/autotuning/tuner/utils.py +++ b/deepspeed/autotuning/tuner/utils.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import numpy as np import itertools from ..utils import * diff --git a/deepspeed/autotuning/utils.py b/deepspeed/autotuning/utils.py index 
5cfa92cd991180da67ad3184ad1d884551ad77e7..40da7ec9613ac486b6d37e7270502bd570e246a3 100644 --- a/deepspeed/autotuning/utils.py +++ b/deepspeed/autotuning/utils.py @@ -1,10 +1,10 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import re import collections.abc import os import json from deepspeed.runtime.constants import GRADIENT_ACCUMULATION_STEPS, TRAIN_MICRO_BATCH_SIZE_PER_GPU -import hjson -import sys import itertools import copy @@ -35,23 +35,11 @@ def was_interruptted(filename): return False -def was_interruptted(filename): - if not os.path.exists(filename): - return "stderr.log does not exist" - with open(filename) as f: - for line in f: - s = "KeyboardInterrupt" - idx = line.find(s) - if idx != -1: - return True - return False - - def find_replace_str(value, replace_dict): if not isinstance(value, str): return str(value) - matches = re.findall("\$[A-Za-z0-9_]+", value) + matches = re.findall(r"\$[A-Za-z0-9_]+", value) for var in matches: var_key = var.replace("$", "").lower() if var_key == "nvme_path": diff --git a/deepspeed/checkpoint/__init__.py b/deepspeed/checkpoint/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2ef0817c724eee9ca827e710b7b7e737cae74324 100644 --- a/deepspeed/checkpoint/__init__.py +++ b/deepspeed/checkpoint/__init__.py @@ -0,0 +1,19 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .reshape_meg_2d import reshape_meg_2d_parallel + +from .deepspeed_checkpoint import DeepSpeedCheckpoint + +from .utils import (get_layer_ckpt_name_for_rank, + get_model_ckpt_name_for_rank, + get_zero_ckpt_name_for_rank) + +from .reshape_utils import (merge_state) + +from .reshape_3d_utils import (model_3d_desc, get_model_3d_descriptor) + +from .zero_checkpoint import ZeROCheckpoint + +from .universal_checkpoint import enable_universal_checkpoint + +from .constants import * diff --git a/deepspeed/checkpoint/constants.py b/deepspeed/checkpoint/constants.py index 
f45777025db7594e1b3e6f56bb1d39402c363cfb..787398486ca2132c89658e44847ee9a3e9dd472e 100644 --- a/deepspeed/checkpoint/constants.py +++ b/deepspeed/checkpoint/constants.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' ''' Various symbolic constants used for model checkpointing ''' @@ -11,15 +12,54 @@ FP32_FLAT_GROUPS = 'fp32_flat_groups' BASE_OPTIMIZER_STATE = 'base_optimizer_state' SINGLE_PARTITION_OF_FP32_GROUPS = "single_partition_of_fp32_groups" -GROUPS_PADDING = 'groups_padding' - +GROUP_PADDINGS = 'group_paddings' PARTITION_COUNT = 'partition_count' ZERO_STAGE = 'zero_stage' CLIP_GRAD = 'clip_grad' +FP32_WEIGHT_KEY = "fp32" ######################################### # Module checkpoint keys ######################################### +PARAM = 'param' PARAM_SHAPES = 'param_shapes' BUFFER_NAMES = 'buffer_names' + +######################################### +# Checkpoint naming constants +######################################### +MODEL_FILE_PREFIX = 'mp_rank_' +ZERO_FILE_PREFIX = 'zero_pp_rank_' +OPTIM_FILE_SUFFIX = '_optim_states.pt' +MODEL_FILE_SUFFIX = '_model_states.pt' +LAYER_FILE_PREFIX = 'layer_' +BF16_ZERO_FILE_PREFIX = 'bf16_' + ZERO_FILE_PREFIX +FP16_ZERO_FILE_PREFIX = 'fp16_' + ZERO_FILE_PREFIX + +######################################### +# Checkpoint utility keys +######################################### DS_VERSION = 'ds_version' + +######################################### +# Universal Checkpoint keys +######################################### +UNIVERSAL_CHECKPOINT_INFO = 'universal_checkpoint_info' +UNIVERSAL_CHECKPOINT_VERSION_KEY = 'universal_checkpoint_version' +# Reserve version 0.1 for the hardcoded logic used in BLOOM-176B training +UNIVERSAL_CHECKPOINT_VERSION_VALUE = 0.2 + +# Vocabulary padding +VOCAB_DIVISIBILITY_PADDING_TENSOR = 'vocab_divisibility_padding_tensor' +PADDED_VOCAB_SIZE = 'padded_vocab_size' +ORIGINAL_VOCAB_SIZE = 'original_vocab_size' + +# Parameter splitting/merging +PARAM_SLICE_MAPPINGS = 
'param_slice_mappings' +CAT_DIM = "cat_dim" + +# Regex list of parameters that require special handling +VOCABULARY_PARAMETER_PATTERNS = 'vocabulary_parameter_patterns' +PIPELINE_REPLICATED_PARAMETER_PATTERNS = 'pipeline_replicated_parameter_patterns' +PARAMETER_TO_AVERAGE_PATTERNS = 'parameter_to_average_patterns' +PARAMETER_WITH_ROW_PARALLELISM_PATTERNS = 'parameter_with_row_parallelism_patterns' diff --git a/deepspeed/checkpoint/deepspeed_checkpoint.py b/deepspeed/checkpoint/deepspeed_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..c1a31b0fb4a5970ad500558cbbbcbe4432af1337 --- /dev/null +++ b/deepspeed/checkpoint/deepspeed_checkpoint.py @@ -0,0 +1,317 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +from typing import Dict +import torch + +from .reshape_3d_utils import model_3d_desc +from .reshape_utils import (basic_folder_validation, + merge_state, + partition_data, + get_files, + get_files_with_prefix) + +from .constants import (MODEL_FILE_PREFIX, LAYER_FILE_PREFIX) + +from .reshape_meg_2d import reshape_meg_2d_parallel, meg_2d_parallel_map +from .zero_checkpoint import ZeROCheckpoint +from .constants import * + +EMBEDDING_LAYER_INDEX = 0 +FINAL_LAYER_NORM_INDEX = -1 +ARGS_KEY = 'args' +CHECKPOINT_INFO_KEY = 'checkpoint_info' +ITERATION_KEY = 'iteration' + +SEQUENTIAL_LAYERS = [ + 'input_layernorm.weight', + 'input_layernorm.bias', + 'self_attention.dense.bias', + 'post_attention_layernorm.weight', + 'post_attention_layernorm.bias', + 'mlp.dense_4h_to_h.bias', + 'position_embeddings.weight' +] + +LAYER_CONCAT_DIM = {'self_attention.dense.weight': 1, 'mlp.dense_4h_to_h.weight': 1} + + +class DeepSpeedCheckpoint(object): + def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None): + self.dir = dir + self._validate_folder(dir) + + self.zero_checkpoint = ZeROCheckpoint(dir) + + self.file_list = get_files(dir) + self.layer_files = get_files_with_prefix(self.file_list, LAYER_FILE_PREFIX) + 
self.mp_rank_files = get_files_with_prefix(self.file_list, MODEL_FILE_PREFIX) + + self.layer_keys = self._get_layer_keys() + self.layer_count = len(self.layer_keys) + + self.tp_degree = self.zero_checkpoint.get_src_tp_degree( + ) if tp_degree is None else tp_degree + self.pp_degree = self.zero_checkpoint.get_src_pp_degree( + ) if pp_degree is None else pp_degree + self.dp_degree = self.zero_checkpoint.get_src_dp_degree( + ) if dp_degree is None else dp_degree + + self.original_world_size = self.zero_checkpoint.get_src_tp_degree( + ) * self.zero_checkpoint.get_src_pp_degree( + ) * self.zero_checkpoint.get_src_dp_degree() + self.world_size = self.tp_degree * self.pp_degree * self.dp_degree + + self.old_2d_map = meg_2d_parallel_map(self.zero_checkpoint.get_src_pp_degree(), + self.zero_checkpoint.get_src_tp_degree()) + self.old_2d_map.simple_init() + self.new_2d_map = reshape_meg_2d_parallel( + old_pp_degree=self.zero_checkpoint.get_src_pp_degree(), + old_tp_degree=self.zero_checkpoint.get_src_tp_degree(), + new_pp_degree=self.pp_degree, + new_tp_degree=self.tp_degree) + + if self.is_change_pp_degree() or self.is_change_tp_degree( + ) or self.is_change_dp_degree(): + self.zero_checkpoint.reshape( + model_3d_desc(self.pp_degree, + self.tp_degree, + self.dp_degree)) + + self.global_state = {} + + self._sanity_check() + self.pp_to_transformer_map = self._build_pp_transformer_map() + self.transformer_file_map = self._build_transformer_file_map() + self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX) + self.tp_to_final_norm_map = self._build_tp_other_layer_map( + FINAL_LAYER_NORM_INDEX) + self._build_global_state() + + def is_change_tp_degree(self): + return self.tp_degree != self.zero_checkpoint.get_src_tp_degree() + + def is_change_pp_degree(self): + return self.pp_degree != self.zero_checkpoint.get_src_pp_degree() + + def is_change_dp_degree(self): + return self.dp_degree != self.zero_checkpoint.get_src_dp_degree() + + def 
show_2d_mapping(self): + print(f'reshaped 2d map ---- begin') + + for i in range(self.pp_degree): + for j in range(self.tp_degree): + file_list = self.get_2d_parallel_files(pp_index=i, tp_index=j) + print(f'[{i}, {j}] = {file_list}') + + print(f'reshaped 2d map ---- end') + + def show_tp_embedding_map(self): + self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers') + + def show_tp_final_norm_map(self): + self._dump_mapping(self.tp_to_final_norm_map, 'tp_to_final_norm_layers') + + def show_pp_tranformer_map(self): + self._dump_mapping(self.pp_to_transformer_map, 'pp_to_tranformer_layers') + + def show_transformer_file_map(self): + self._dump_mapping(self.transformer_file_map, 'rank_to_tranformer_files') + + def _build_global_state(self): + sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) + self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) + self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) + + def get_zero_checkpoint_state(self, pp_index, tp_index, dp_index) -> dict: + return self.zero_checkpoint.get_state_for_rank(pp_index=pp_index, + tp_index=tp_index, + dp_index=dp_index, + keys_to_ignore=[PARAM_SHAPES]) + + def get_zero_files(self, pp_index, tp_index, dp_index) -> list: + return self.zero_checkpoint.get_files_for_rank(pp_index=pp_index, + tp_index=tp_index, + dp_index=dp_index) + + def get_embedding_layer_id(self): + return self.layer_keys[EMBEDDING_LAYER_INDEX] + + def get_final_norm_layer_id(self): + return self.layer_keys[FINAL_LAYER_NORM_INDEX] + + def get_iteration(self): + if not ITERATION_KEY in self.global_state: + sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) + self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) + + return self.global_state[ITERATION_KEY] + + def get_embedding_state(self, tp_index: int) -> Dict: + assert tp_index in self.tp_to_embedding_map.keys() + sd_list = [ + torch.load(fname, + map_location=torch.device('cpu')) + for fname in 
self.tp_to_embedding_map[tp_index] + ] + sd = self._merge_state_dicts(sd_list) + return sd + + def get_embedding_files(self, tp_index: int) -> list: + assert tp_index in self.tp_to_embedding_map.keys() + return self.tp_to_embedding_map[tp_index] + + def _get_checkpoint_value(self, key): + if not key in self.global_state: + sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) + self.global_state[key] = sd.get(key, None) + + return self.global_state[key] + + def get_args(self): + return self._get_checkpoint_value(ARGS_KEY) + + def get_checkpoint_info(self, info_key=CHECKPOINT_INFO_KEY): + return self._get_checkpoint_value(info_key) + + def get_2d_parallel_state(self, tp_index: int, pp_index: int) -> dict: + assert tp_index < self.tp_degree + assert pp_index < self.pp_degree + fname_list = self.get_2d_parallel_files(tp_index=tp_index, pp_index=pp_index) + sd_list = [ + torch.load(fname, + map_location=torch.device('cpu')) for fname in fname_list + ] + + merged_sd = None + for sd in sd_list: + if merged_sd is None: + merged_sd = sd + else: + merged_sd = merge_state(merged_sd, sd) + + return merged_sd + + def get_transformer_state(self, tp_index: int, pp_index: int) -> list: + assert tp_index < self.tp_degree + assert pp_index < self.pp_degree + t_list = [] + for fname_list in self.transformer_file_map[(tp_index, pp_index)]: + sd_list = [ + torch.load(fname, + map_location=torch.device('cpu')) for fname in fname_list + ] + sd = self._merge_state_dicts(sd_list) + t_list.append(sd) + return t_list + + def get_pp_transformer_map(self, pp_index: int) -> list: + assert pp_index < self.pp_degree + return self.pp_to_transformer_map[pp_index] + + def get_final_norm_state(self, tp_index: int) -> Dict: + assert tp_index in self.tp_to_final_norm_map.keys() + sd = torch.load(self.tp_to_final_norm_map[tp_index][0], + map_location=torch.device('cpu')) + return sd + + def get_final_norm_files(self, tp_index: int) -> list: + assert tp_index in 
self.tp_to_final_norm_map.keys() + return self.tp_to_final_norm_map[tp_index] + + def _build_tp_other_layer_map(self, layer_index: int): + assert layer_index < len(self.layer_files) + layer_files = get_files_with_prefix(self.layer_files, + self.layer_keys[layer_index]) + layer_file_partitions = partition_data(layer_files, self.tp_degree) + data_map = {i: flist for i, flist in enumerate(layer_file_partitions)} + return data_map + + def get_2d_parallel_files(self, tp_index: int, pp_index: int) -> list: + assert tp_index < self.tp_degree + assert pp_index < self.pp_degree + file_indices = self.new_2d_map.get_data(pp_index=pp_index, tp_index=tp_index) + return [self.mp_rank_files[i] for i in file_indices] + + def _build_pp_transformer_map(self): + data_map = {} + transformer_layers = self.layer_keys[1:-1] + layers_per_pp = len(transformer_layers) // self.pp_degree + data_map = { + i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] + for i in range(0, + self.pp_degree) + } + return data_map + + def _dump_mapping(self, data_map, map_tag=None): + if map_tag is not None: + print(f'Dump mapping: {map_tag}') + for k, v in data_map.items(): + print(f'{k} = {v}') + + def _build_transformer_file_map(self): + transformer_layer_keys = self.layer_keys[1:-1] + file_map = {} + # XXX: this is not guaranteed + layers_per_pp = len(transformer_layer_keys) // self.pp_degree + if layers_per_pp == 0: + layers_per_pp = 1 + #print(f"{transformer_layer_keys} {layers_per_pp}") + for key_index, layer_key in enumerate(transformer_layer_keys): + pp_index = key_index // layers_per_pp + layer_files = get_files_with_prefix(self.layer_files, layer_key) + layer_file_partitions = partition_data(layer_files, self.tp_degree) + for tp_index in range(self.tp_degree): + map_key = (tp_index, pp_index) + if not map_key in file_map.keys(): + file_map[map_key] = [] + file_map[map_key].append(layer_file_partitions[tp_index]) + + return file_map + + def _sanity_check(self): + assert 
len(self.mp_rank_files) % self.tp_degree == 0 + assert len(self.layer_keys) > 2 + assert self.zero_checkpoint.num_files % (self.pp_degree * self.tp_degree) == 0 + # XXX: fix me - isn't always the case + # only true with --pp-partition-method 'type:transformer|embedding' \ + # assert (len(self.layer_keys) - 2) % self.pp_degree == 0 + + def validate_files(self): + for file in self.file_list: + if not os.path.isfile(file): + print(f'Error: {file} is not existent') + + def _get_layer_keys(self): + key_set = set() + key_len = len(LAYER_FILE_PREFIX) + 2 + for file_path in self.layer_files: + _, fname = os.path.split(file_path) + key_set.add(fname[:key_len]) + return sorted(list(key_set)) + + def _merge_state_dicts(self, sd_list): + merged_sd = {} + for key in sd_list[0].keys(): + if not key in SEQUENTIAL_LAYERS: + cat_dim = LAYER_CONCAT_DIM.get(key, 0) + merged_sd[key] = torch.cat([sd[key] for sd in sd_list], dim=cat_dim) + else: + merged_sd[key] = sd_list[0][key] + + return merged_sd + + def _validate_folder(self, dir): + basic_folder_validation(dir) + + file_list = get_files(dir) + + for file_prefix in [ + MODEL_FILE_PREFIX, + LAYER_FILE_PREFIX, + f'{LAYER_FILE_PREFIX}01' + ]: + ckpt_files = get_files_with_prefix(file_list, file_prefix) + assert len(ckpt_files) > 0, f'{dir} seems a bogus DeepSpeed checkpoint folder: Cannot find {file_prefix}* files in there.' 
diff --git a/deepspeed/checkpoint/reshape_3d_utils.py b/deepspeed/checkpoint/reshape_3d_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..15faffb2a680bb310c8a31986977d4cc85a0af95 --- /dev/null +++ b/deepspeed/checkpoint/reshape_3d_utils.py @@ -0,0 +1,120 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .reshape_utils import (get_files, + get_files_with_prefix, + partition_data, + get_zero_files) + +from .constants import (MODEL_FILE_PREFIX, LAYER_FILE_PREFIX) + +from .reshape_meg_2d import (reshape_meg_2d_parallel, meg_2d_parallel_map) + +PP_DIM = 'PP' +TP_DIM = 'TP' +DP_DIM = 'DP' + + +class model_3d_desc(object): + def __init__(self, pp_degree=1, tp_degree=1, dp_degree=1): + self.pp_degree = pp_degree + self.tp_degree = tp_degree + self.dp_degree = dp_degree + + def reshape(self, target_3d_desc, verbose=False): + valid_reshape, reshape_errors = self.can_reshape(target_3d_desc) + assert valid_reshape, ','.join(reshape_errors) + tgt_2d_map = reshape_meg_2d_parallel(old_pp_degree=self.pp_degree, + old_tp_degree=self.tp_degree, + new_pp_degree=target_3d_desc.pp_degree, + new_tp_degree=target_3d_desc.tp_degree, + verbose=verbose) + + flat_3d_map = flatten_dp_dimension(meg_2d_map=tgt_2d_map, + src_2d_size=self.pp_degree * self.tp_degree, + dp_degree=self.dp_degree) + + return unflatten_dp_dimension(meg_2d_map=flat_3d_map, + dp_degree=target_3d_desc.dp_degree) + + def get_desc(self): + return f'{PP_DIM},{TP_DIM},{DP_DIM} = ({self.pp_degree}, {self.tp_degree}, {self.dp_degree})' + + def world_size(self): + return self.pp_degree * self.tp_degree * self.dp_degree + + def is_valid(self, pp_index, tp_index, dp_index): + err_msg = [] + valid = True + for index, degree, dim_name in [ + (pp_index, self.pp_degree, PP_DIM), + (tp_index, self.tp_degree, TP_DIM), + (dp_index, self.dp_degree, DP_DIM)]: + if index >= degree: + valid = False + err_msg.append( + f'{dim_name} indexing error: index {index} >= degree {degree}') + + return valid, err_msg 
+ + def can_reshape(self, target_3d_desc): + err_msg = [] + if target_3d_desc.pp_degree > self.pp_degree: + err_msg.append( + f'Expansion reshape not supported - {PP_DIM}: {self.pp_degree} ---> {target_3d_desc.pp_degree}' + ) + + if target_3d_desc.tp_degree > self.tp_degree: + err_msg.append( + f'Expansion reshape not supported - {TP_DIM}: {self.tp_degree} ---> {target_3d_desc.tp_degree}' + ) + + if target_3d_desc.dp_degree > self.dp_degree: + err_msg.append( + f'Expansion reshape not supported - {DP_DIM}: {self.dp_degree} ---> {target_3d_desc.dp_degree}' + ) + + return len(err_msg) == 0, err_msg + + +def get_model_3d_descriptor(dir): + file_list = get_files(dir) + zero_file_list = get_zero_files(dir) + num_pp0_files = len(get_files_with_prefix(file_list, f'{LAYER_FILE_PREFIX}01')) + if num_pp0_files > 0: + tp_degree = num_pp0_files + pp_degree = len(get_files_with_prefix(file_list, MODEL_FILE_PREFIX)) // tp_degree + dp_degree = max(1, len(zero_file_list) // (pp_degree * tp_degree)) + else: + tp_degree = len(get_files_with_prefix(file_list, MODEL_FILE_PREFIX)) + dp_degree = max(1, len(zero_file_list) // tp_degree) + pp_degree = 0 + + return model_3d_desc(pp_degree, tp_degree, dp_degree) + + +def flatten_dp_dimension(meg_2d_map, src_2d_size, dp_degree): + new_meg_2d_map = meg_2d_parallel_map(meg_2d_map.pp_degree, meg_2d_map.tp_degree) + for pp_index in range(meg_2d_map.pp_degree): + for tp_index in range(meg_2d_map.tp_degree): + dp0_indices = meg_2d_map.get_data(pp_index, tp_index) + for idx in dp0_indices: + dpX_indices = [idx + (i * src_2d_size) for i in range(dp_degree)] + new_meg_2d_map.add_data(pp_index, tp_index, dpX_indices) + return new_meg_2d_map + + +def unflatten_dp_dimension(meg_2d_map, dp_degree): + pp_degree = meg_2d_map.pp_degree + tp_degree = meg_2d_map.tp_degree + meg_2d_map_list = [ + meg_2d_parallel_map(pp_degree=pp_degree, + tp_degree=tp_degree) for _ in range(dp_degree) + ] + for pp_index in range(pp_degree): + for tp_index in range(tp_degree): 
+ flat_dp_indices = meg_2d_map.get_data(pp_index, tp_index) + partitioned_dp_indices = partition_data(flat_dp_indices, dp_degree) + for dp_indices, _2d_map in zip(partitioned_dp_indices, meg_2d_map_list): + _2d_map.add_data(pp_index, tp_index, dp_indices) + + return meg_2d_map_list diff --git a/deepspeed/checkpoint/reshape_meg_2d.py b/deepspeed/checkpoint/reshape_meg_2d.py new file mode 100644 index 0000000000000000000000000000000000000000..4800b55583b83b6d15ab085c9e921d25b653d421 --- /dev/null +++ b/deepspeed/checkpoint/reshape_meg_2d.py @@ -0,0 +1,228 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .reshape_utils import partition_data + + +class meg_2d_parallel_map(object): + def __init__(self, pp_degree, tp_degree): + self.pp_degree = pp_degree + self.tp_degree = tp_degree + self.map = {} + + def simple_init(self): + self.map = { + self._make_key(i // self.tp_degree, + i % self.tp_degree): [i] + for i in range(self.pp_degree * self.tp_degree) + } + + def add_data(self, pp_index, tp_index, data): + self._validate_indices(pp_index, tp_index) + assert type(data) is list + + key = self._make_key(pp_index, tp_index) + if not key in self.map.keys(): + self.map[key] = [] + self.map[key] += data + + def get_data(self, pp_index=None, tp_index=None): + self._validate_indices(pp_index, tp_index) + pp_indices = list(range(self.pp_degree)) if pp_index is None else [pp_index] + tp_indices = list(range(self.tp_degree)) if tp_index is None else [tp_index] + + result = [] + for i in pp_indices: + for j in tp_indices: + result += self.map[self._make_key(i, j)] + + return result + + def print_data(self, tag): + print(f'{tag}') + for key, value in self.map.items(): + print(f'{key} = {value}') + + def _validate_indices(self, pp_index, tp_index): + assert pp_index is None or pp_index < self.pp_degree + assert tp_index is None or tp_index < self.tp_degree + + def _make_key(self, i, j): + return f'{i},{j}' + + +def _reshape_tp_dimension(old_2d_map, new_tp_degree): + 
old_pp_degree = old_2d_map.pp_degree + new_2d_map = meg_2d_parallel_map(old_pp_degree, new_tp_degree) + for i in range(old_pp_degree): + ranks_for_pp_index = old_2d_map.get_data(pp_index=i, tp_index=None) + split_ranks = partition_data(ranks_for_pp_index, new_tp_degree) + for j in range(new_tp_degree): + new_2d_map.add_data(i, j, split_ranks[j]) + + return new_2d_map + + +def _reshape_pp_dimension(old_2d_map, new_pp_degree): + old_tp_degree = old_2d_map.tp_degree + new_2d_map = meg_2d_parallel_map(new_pp_degree, old_tp_degree) + for i in range(old_tp_degree): + ranks_for_tp_index = old_2d_map.get_data(pp_index=None, tp_index=i) + split_ranks = partition_data(ranks_for_tp_index, new_pp_degree) + for j in range(new_pp_degree): + new_2d_map.add_data(j, i, split_ranks[j]) + + return new_2d_map + + +def reshape_meg_2d_parallel(old_pp_degree, + old_tp_degree, + new_pp_degree, + new_tp_degree, + verbose=False): + assert new_pp_degree <= old_pp_degree + assert new_tp_degree <= old_tp_degree + + old_2d_map = meg_2d_parallel_map(old_pp_degree, old_tp_degree) + old_2d_map.simple_init() + if verbose: + old_2d_map.print_data(f'original_2d_map:') + + if old_tp_degree != new_tp_degree: + new_tp_map = _reshape_tp_dimension(old_2d_map, new_tp_degree) + else: + new_tp_map = old_2d_map + if verbose: + new_tp_map.print_data(f'after_tp_reshape:') + + if old_pp_degree != new_pp_degree: + final_map = _reshape_pp_dimension(new_tp_map, new_pp_degree) + else: + final_map = new_tp_map + + if verbose: + final_map.print_data(f'final_2d_map:') + + return final_map + + +def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None): + """ + Initialize model data parallel groups. + + Arguments: + tp_size: number of GPUs used to parallelize model tensor. + pp_size: number of GPUs used to parallelize model pipeline. + dp_size: number of GPUs used to parallelize model data. + + Let's say we have a total of 16 GPUs denoted by g0 ... 
g15 and we + use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize + the model pipeline. The present function will + create 8 tensor model-parallel groups, 4 pipeline model-parallel groups + and 8 data-parallel groups as: + 8 data_parallel groups: + [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15] + 8 tensor model-parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15] + 4 pipeline model-parallel groups: + [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + + world_size = tp_size * pp_size * dp_size + + print(f"\n\n*** tp={tp_size}, pp={pp_size}, dp={dp_size}, world={world_size}") + + tensor_model_parallel_size = min(tp_size, world_size) + pipeline_model_parallel_size = min(pp_size, world_size) + data_parallel_size = world_size // (tensor_model_parallel_size * + pipeline_model_parallel_size) + + num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size + num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size + num_data_parallel_groups = world_size // data_parallel_size + + # Build the data-parallel groups. + all_dp_group_ranks = [] + for i in range(pipeline_model_parallel_size): + start_rank = i * num_pipeline_model_parallel_groups + end_rank = (i + 1) * num_pipeline_model_parallel_groups + for j in range(tensor_model_parallel_size): + ranks = range(start_rank + j, end_rank, tensor_model_parallel_size) + all_dp_group_ranks.append(list(ranks)) + + print("DP", all_dp_group_ranks) + + # Build the model-parallel groups. 
+ all_pp_group_ranks = [] + for i in range(data_parallel_size): + ranks = [ + data_parallel_group_ranks[i] + for data_parallel_group_ranks in all_dp_group_ranks + ] + all_pp_group_ranks.append(list(ranks)) + + print(f"PP", all_pp_group_ranks) + + # Build the tensor model-parallel groups. + all_tp_group_ranks = [] + for i in range(num_tensor_model_parallel_groups): + ranks = range(i * tensor_model_parallel_size, + (i + 1) * tensor_model_parallel_size) + all_tp_group_ranks.append(list(ranks)) + + print(f"TP", all_tp_group_ranks) + + return all_tp_group_ranks, all_pp_group_ranks, all_dp_group_ranks + + # # Build the pipeline model-parallel groups and embedding groups + # # (first and last rank in each pipeline model-parallel group). + # for i in range(num_pipeline_model_parallel_groups): + # ranks = range(i, world_size, + # num_pipeline_model_parallel_groups) + # print(f"EMB{i}", list(ranks)) + + +def reshape(src, tgt): + """ + reshape([tp_size_src, pp_size_src, dp_size_src], + [tp_size_tgt, pp_size_tgt, dp_size_tgt]) + """ + + print(f"\n\n*** Reshaping: {src} => {tgt}") + + tp_size_src, pp_size_src, dp_size_src = src + tp_size_tgt, pp_size_tgt, dp_size_tgt = tgt + + tp_ranks1, pp_ranks1, dp_ranks1 = get_mpu_ranks(tp_size=tp_size_src, pp_size=pp_size_src, dp_size=dp_size_src) + tp_ranks2, pp_ranks2, dp_ranks2 = get_mpu_ranks(tp_size=tp_size_tgt, pp_size=pp_size_src, dp_size=dp_size_src) + tp_ranks3, pp_ranks3, dp_ranks3 = get_mpu_ranks(tp_size=tp_size_tgt, pp_size=pp_size_tgt, dp_size=dp_size_src) + + # handle tp contraction first + print("\n*** TP contraction:") + + for i, r in enumerate(tp_ranks1): + print(f'{tp_ranks1[i]} => {tp_ranks2[i]}') + + # handle pp contraction next + + print("\n*** PP contraction:") + + for i, r in enumerate(pp_ranks1): + print(f'{pp_ranks2[i]} => {pp_ranks3[i]}') + + +# easy +#reshape([2,2,1],[1,1,1]) + +# probably need more logic to suggest how to pack +#reshape([4,4,1],[2,2,1]) + +#reshape([2,4,2], [8,32,1]) + +# get_mpu_ranks(2,2,2) +# 
get_mpu_ranks(4,2,1) +# get_mpu_ranks(2,4,1) +# get_mpu_ranks(1,1,8) diff --git a/deepspeed/checkpoint/reshape_utils.py b/deepspeed/checkpoint/reshape_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4d6b272f43e3ef202334eef8fa46960fc5b7cf23 --- /dev/null +++ b/deepspeed/checkpoint/reshape_utils.py @@ -0,0 +1,100 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import torch +from collections import OrderedDict +from .constants import (ZERO_FILE_PREFIX, FP16_ZERO_FILE_PREFIX, BF16_ZERO_FILE_PREFIX) + + +def basic_folder_validation(dir): + assert os.path.exists(dir), f'{dir} path does not exist' + assert os.path.isdir(dir), f'{dir} is not a folder' + + +def get_files_with_prefix(all_files, prefix): + file_list = [] + for file_path in all_files: + _, fname = os.path.split(file_path) + if fname.startswith(prefix): + file_list.append(file_path) + + return sorted(file_list) + + +def validate_files(file_list): + for file in file_list: + if not os.path.isfile(file): + print(f'Error: {file} is not existent') + + +def get_files(dir): + file_list = [] + for root, _, files in os.walk(dir): + for file in files: + file_list.append(os.path.join(root, file)) + return file_list + + +def get_zero_files(dir): + file_list = get_files(dir) + for prefix in [ZERO_FILE_PREFIX, FP16_ZERO_FILE_PREFIX, BF16_ZERO_FILE_PREFIX]: + zero_files = get_files_with_prefix(file_list, prefix) + if len(zero_files) > 0: + return zero_files + + return [] + + +def partition_data(data_list, num_partitions): + num_elems = len(data_list) + assert num_elems % num_partitions == 0 + partition_size = num_elems // num_partitions + partitions_list = [ + data_list[i:i + partition_size] for i in range(0, + num_elems, + partition_size) + ] + return partitions_list + + +def _key_list_to_string(key_list): + return '.'.join(key_list) + + +def merge_state_dict(dict_a, dict_b, key_list): + merged_dict = type(dict_a)({}) + + for key, value in dict_b.items(): + if key in 
dict_a.keys(): + merged_dict[key] = merge_state(dict_a[key], dict_b[key], [str(key)]) + else: + merged_dict[key] = value + + return merged_dict + + +def merge_state_list(list_a, list_b, key_list): + if len(list_a) != len(list_b): + print(f'{_key_list_to_string(key_list)}') + raise ValueError( + f'Cannot merge lists of different lengths, a = {len(list_a)} b = {len(list_b)}' + ) + + return [merge_state(a, b, key_list) for a, b in zip(list_a, list_b)] + + +def merge_state(state_a, state_b, key_list=[]): + if type(state_a) != type(state_b): + key_list_string = _key_list_to_string(key_list) + print(f'key_list = {key_list_string}') + raise ValueError( + f'Cannot merge two states of types {type(state_a)} and type {type(state_b)}') + + if type(state_a) in (dict, OrderedDict): + return merge_state_dict(state_a, state_b, key_list) + elif type(state_a) in (list, tuple): + return type(state_a)(merge_state_list(state_a, state_b, key_list)) + elif torch.is_tensor(state_a): + return torch.cat([state_a, state_b], 0) + else: + return state_a diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..b58de4871031f0a26a6a88c332015d14fb85d5b9 --- /dev/null +++ b/deepspeed/checkpoint/universal_checkpoint.py @@ -0,0 +1,108 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" +import os +import torch +import types +from .constants import (FP32_WEIGHT_KEY, + PARAM, + VOCAB_DIVISIBILITY_PADDING_TENSOR, + CAT_DIM) + + +def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): + hp_mapping = self._hp_mapping + optim_state_keys = hp_mapping.get_optim_state_keys() + hp_keys = [FP32_WEIGHT_KEY] + optim_state_keys + checkpoint_files = {key: os.path.join(folder, f"{key}.pt") for key in hp_keys} + + for file in checkpoint_files.values(): + assert os.path.isfile(file), f'{file} is not a valid file' + + for key in hp_keys: + ckpt_file = checkpoint_files[key] + ckpt_dict = 
torch.load(ckpt_file) + full_hp_param = ckpt_dict[PARAM] + + # need to deal with slices that were averaged. + # the opposite of averaging here becomes an exact copy of the first slice + # I thought of 2 ways: + # implementation a. find a way for a client to pass a dict with patterns + # if any(re.search(pattern, folder) for pattern in WEIGHTS_TO_AVERAGE_PATTERNS): + # tp_rank = 0 + # tp_world_size = 1 + # the other approach is to assume that the saved data is correct and if full_hp_param.shape == + # self.shape that means we automatically copy? + # implementation b. + # this version requires no additional data passed from the client + # if the shapes already match it must be slices that were averaged - so we just hack around those + if full_hp_param.shape == self.shape: + tp_rank = 0 + tp_world_size = 1 + + # special case for word_embeddings weights which get padded differently depending on TP degree. + # the converter to universal currently strips the original padding completely so the saved + # weight is padding-free and we just need to add new padding depending on the target TP + # degree + vocab_divisibility_padding_tensor = ckpt_dict.get( + VOCAB_DIVISIBILITY_PADDING_TENSOR, + None) + if vocab_divisibility_padding_tensor is not None: + # In the absence of data passed from the user wrt new padded vocab specific to tp degree + # we can again derive that data by reverse engineering the target shapes like so: + padded_target_vocab_size = self.shape[0] * tp_world_size + if padded_target_vocab_size > full_hp_param.shape[0]: + # Need to expand + padding_size = padded_target_vocab_size - full_hp_param.shape[0] + # Implement the following concat in efficient way using pad + #full_hp_param = torch.cat((full_hp_param, padding_tensor), 0) + full_hp_param = torch.nn.functional.pad(full_hp_param, + (0, + 0, + 0, + padding_size), + "constant", + 0) + full_hp_param[:-padding_size, :] = vocab_divisibility_padding_tensor + else: + # Need to shrink or keep the same + 
full_hp_param = full_hp_param[:padded_target_vocab_size, :] + + full_param_numel = full_hp_param.numel() + tp_slice_numel = self.numel() + # if key == FP32_WEIGHT_KEY and 'word_embeddings.weight' in folder: + # print_rank_0(f'{full_hp_param[:10]=}', force=True) + + + assert full_param_numel == tp_world_size * tp_slice_numel, \ + f'Loading {ckpt_file} full param numel {full_param_numel} != tensor slice numel {tp_slice_numel} * tp_world_size {tp_world_size}' + dst_tensor = hp_mapping.hp_fragment if key == FP32_WEIGHT_KEY else hp_mapping.get_optim_state_fragment( + key) + + # print(f"{full_hp_param.shape=} {full_param_numel=} {folder=}") + # print(f"{dst_tensor.shape=} {dst_tensor.numel()=}{folder=}") + + # since when we do many to 1 on tp we cat sometimes on dim=0 and other times on dim=1 we have to do exactly the same in reverse + chunk_dim = ckpt_dict.get(CAT_DIM, 0) + + # this performs the opposite of cat when merging TP slices + tp_hp_slice = full_hp_param.chunk(tp_world_size, chunk_dim)[tp_rank] + tp_hp_slice = tp_hp_slice.flatten() + + lp_frag_address = hp_mapping.lp_fragment_address + tp_hp_fragment = tp_hp_slice.narrow(0, + lp_frag_address.start, + lp_frag_address.numel) + assert dst_tensor.numel() == lp_frag_address.numel, \ + f'Load checkpoint {key} dst_tensor numel {dst_tensor.numel()} != src numel {lp_frag_address.numel}' + + # print(f"{key} SHAPE: {tp_hp_slice.shape=}") + # print(f"{key} SHAPE: {dst_tensor.shape=}") + # print(f"{key} SHAPE: {tp_hp_fragment.shape=}") + dst_tensor.data.copy_(tp_hp_fragment.data) + + +def enable_universal_checkpoint(param_list): + for param in param_list: + param.load_hp_checkpoint_state = types.MethodType(load_hp_checkpoint_state, + param) diff --git a/deepspeed/checkpoint/utils.py b/deepspeed/checkpoint/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4e96d073067de72e7bbf21c50a2bc7daf9b8cfd4 --- /dev/null +++ b/deepspeed/checkpoint/utils.py @@ -0,0 +1,31 @@ +'''Copyright The Microsoft 
DeepSpeed Team''' + +import os +from .constants import (MODEL_FILE_PREFIX, + MODEL_FILE_SUFFIX, + OPTIM_FILE_SUFFIX, + ZERO_FILE_PREFIX) + + +def get_model_ckpt_name_for_rank(base_folder, mp_rank_str): + ckpt_name = os.path.join( + base_folder, + MODEL_FILE_PREFIX + mp_rank_str + MODEL_FILE_SUFFIX, + ) + return ckpt_name + + +def get_zero_ckpt_name_for_rank(base_folder, dp_rank, mp_rank): + zero_prefix = f'{ZERO_FILE_PREFIX}{dp_rank}' + mp_rank_string = f'_{MODEL_FILE_PREFIX}{mp_rank:02d}' + zero_ckpt_name = os.path.join( + base_folder, + zero_prefix + mp_rank_string + OPTIM_FILE_SUFFIX, + ) + return zero_ckpt_name + + +def get_layer_ckpt_name_for_rank(base_folder, layer_id, tp_rank): + ckpt_file = f'{layer_id}-model_{tp_rank:02d}{MODEL_FILE_SUFFIX}' + ckpt_path = os.path.join(base_folder, ckpt_file) + return ckpt_path diff --git a/deepspeed/checkpoint/zero_checkpoint.py b/deepspeed/checkpoint/zero_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..cb33e8e376716860455d899c186ba10d58f4bc50 --- /dev/null +++ b/deepspeed/checkpoint/zero_checkpoint.py @@ -0,0 +1,148 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch + +from .constants import (BASE_OPTIMIZER_STATE, + GROUP_PADDINGS, + OPTIMIZER_STATE_DICT, + PARTITION_COUNT) + +from .reshape_utils import (basic_folder_validation, get_zero_files, merge_state) + +from .reshape_3d_utils import (model_3d_desc, get_model_3d_descriptor) + +GROUP_STATE_KEY = 'state' + + +class ZeROCheckpoint(object): + def __init__(self, dir): + basic_folder_validation(dir) + self.dir = dir + self.file_list = get_zero_files(dir) + self.num_files = len(self.file_list) + assert self.num_files > 0, f'No ZeRO files found in {dir}' + + self.src_3d = get_model_3d_descriptor(dir) + self.target_3d = model_3d_desc(pp_degree=self.src_3d.pp_degree, + tp_degree=self.src_3d.tp_degree, + dp_degree=self.src_3d.dp_degree) + self._3d_file_map = self.src_3d.reshape(self.target_3d) + + def get_src_world_size(self): + 
return self.src_3d.world_size() + + def get_src_tp_degree(self): + return self.src_3d.tp_degree + + def get_src_pp_degree(self): + return self.src_3d.pp_degree + + def get_src_dp_degree(self): + return self.src_3d.dp_degree + + def get_file_indices_for_rank(self, pp_index, tp_index, dp_index): + assert dp_index < len(self._3d_file_map), f'DP index {dp_index} >= DP degree {len(self._3d_file_map)}' + dp_2d_map = self._3d_file_map[dp_index] + return dp_2d_map.get_data(pp_index, tp_index) + + def get_files_for_rank(self, pp_index, tp_index, dp_index): + file_idx_list = self.get_file_indices_for_rank(pp_index, tp_index, dp_index) + return [self.file_list[idx] for idx in file_idx_list] + + def get_state_for_rank(self, + pp_index, + tp_index, + dp_index, + keys_to_ignore=[], + strip_tensor_paddings=True): + state_file_list = self.get_files_for_rank(pp_index, tp_index, dp_index) + merged_sd = None + for state_file in state_file_list: + sd = torch.load(state_file, map_location=torch.device('cpu')) + for key in keys_to_ignore: + sd.pop(key, None) + + if strip_tensor_paddings: + self._strip_tensor_paddings(sd) + + if merged_sd is None: + merged_sd = sd + else: + merged_sd = merge_state(merged_sd, sd) + + self._update_partition_count(merged_sd) + if strip_tensor_paddings: + self._clear_group_paddings(merged_sd) + + return merged_sd + + def print_3d_index_map(self, tag=None): + if tag: + print(f'3D index map: {tag}') + for dp_index, _2d_map in enumerate(self._3d_file_map): + _2d_map.print_data(f'dp = {dp_index}') + + def print_3d_file_map(self, tag=None): + if tag: + print(f'3D file map: {tag}') + for dp_index, _2d_map in enumerate(self._3d_file_map): + for pp_index in _2d_map.pp_degree: + for tp_index in _2d_map.tp_degree: + file_index_list = _2d_map.get_data(pp_index, tp_index) + file_list = [self.file_list[idx] for idx in file_index_list] + print(f'{pp_index}, {tp_index}, {dp_index} => {file_list}') + + def reshape(self, target_3d_desc: model_3d_desc): + self.target_3d = 
target_3d_desc + self._3d_file_map = self.src_3d.reshape(self.target_3d) + + def _strip_tensor_paddings(self, sd): + param_group_states = self._get_param_group_states(sd) + if param_group_states is None: + return + + group_paddings = self._get_optimizer_state(sd, GROUP_PADDINGS) + if group_paddings is None: + return + + for key, group_state in param_group_states.items(): + if group_paddings[key] == 0: + continue + for state_name, state_value in group_state.items(): + if torch.is_tensor(state_value): + raw_length = state_value.numel() - group_paddings[key] + group_state[state_name] = torch.narrow(state_value, + 0, + 0, + raw_length).clone() + + def _clear_group_paddings(self, sd): + group_paddings = self._get_optimizer_state(sd, GROUP_PADDINGS) + if group_paddings: + num_groups = len(group_paddings) + sd[OPTIMIZER_STATE_DICT][GROUP_PADDINGS] = [0] * num_groups + + def _get_optimizer_state(self, sd, state_key): + optimizer_state = sd.get(OPTIMIZER_STATE_DICT, None) + if optimizer_state is None: + return None + + return optimizer_state.get(state_key, None) + + def _get_param_group_states(self, sd): + optimizer_state = sd.get(OPTIMIZER_STATE_DICT, None) + if optimizer_state is None: + return None + + base_optimizer_state = optimizer_state.get(BASE_OPTIMIZER_STATE, None) + if base_optimizer_state is None: + return None + + return base_optimizer_state.get(GROUP_STATE_KEY, None) + + def _update_partition_count(self, sd): + partition_counts = self._get_optimizer_state(sd, PARTITION_COUNT) + if partition_counts: + num_groups = len(partition_counts) + sd[OPTIMIZER_STATE_DICT][PARTITION_COUNT] = [self.target_3d.dp_degree + ] * num_groups diff --git a/deepspeed/comm/__init__.py b/deepspeed/comm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..84d0f54c76a11096bc362d24887ec8c7c7f5f1f9 --- /dev/null +++ b/deepspeed/comm/__init__.py @@ -0,0 +1,52 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from .utils import * +from deepspeed 
import utils + +supported_torch_version = False + +# See more details at: https://github.com/pytorch/pytorch/pull/48767 +# The PG API in torch versions lesser than 1.8 are different so it is +# non-trivial to support both in the same API. We will just use the +# DS comm. backend in deepspeed/comm/comm.py if torch version if 1.8+. + +if older_torch(): + # Add custom deepspeed torch comm functions here since we can't import deepspeed.comm + # NOTE: We can't call torch.distributed directly here. Current hack is to import functions before calling them. + supported_torch_version = False + from torch.distributed import * + + def get_world_group(): + return group.WORLD + + def get_global_rank(group, group_rank): + if hasattr(torch.distributed.distributed_c10d, "get_global_rank"): + from torch.distributed.distributed_c10d import get_global_rank as _get_global_rank + else: + from torch.distributed.distributed_c10d import _get_global_rank + return _get_global_rank(group, group_rank) + + def allgather_fn(output_tensor, input_tensor, group=None, async_op=False): + from torch.distributed import all_gather, get_world_size + from torch import chunk + output_tensors = list(chunk(output_tensor, get_world_size(group))) + return all_gather(output_tensors, input_tensor, group=group, async_op=async_op) + + def reduce_scatter_fn(output_tensor, input_tensor, group=None, async_op=False): + from torch.distributed import reduce_scatter, get_world_size + from torch import chunk + input_tensor_lst = list(chunk(input_tensor, get_world_size(group))) + return reduce_scatter(output_tensor, input_tensor_lst, group=group) + + def configure(deepspeed_config=None, + enabled=None, + prof_all=None, + prof_ops=None, + verbose=None): + utils.logger.warn( + "Communication logging is not supported in torch versions older than 1.8") + +else: + supported_torch_version = True + from .comm import * diff --git a/deepspeed/comm/backend.py b/deepspeed/comm/backend.py new file mode 100644 index 
0000000000000000000000000000000000000000..cd436865c5ad79745b4aed4002e6a5db4aa3f5b8 --- /dev/null +++ b/deepspeed/comm/backend.py @@ -0,0 +1,43 @@ +'''Copyright The Microsoft DeepSpeed Team''' +''' DeepSpeed Communication Backend. + +In the future, directly use NCCL/MPI/Gloo/etc without requiring torch.distributed. Simply wrap torch.distributed for now. + +# Custom DS Backends -- Direct C/Ops + - NCCL -- [EXPERIMENTAL] + - MPI -- [EXPERIMENTAL] + - RCCL -- [EXPERIMENTAL] + - GLOO -- [EXPERIMENTAL] + +# DS backend wrapper for torch.distributed [DEFAULT] + - T-NCCL -- [DEFAULT] + - T-GLOO + - T-MPI +''' +''' Backend is the base class + -- NcclBackend, MpiBackend, and TorchBackend are the main subclasses. TorchBackend is the only officially supported backend for now. +''' + + +class Backend(object): + def __init__(self, name='backend', rank=0, size=1): + self.name = name + # The world size and rank of the world process group + self.world_group = None + self.world_size = rank + self.world_rank = size + # Single process group (pg) implementation for now but keep a list for future + self.process_groups = [] + self.initialized = False + + def is_initialized(self): + return self.initialized + + def new_group(self): + # create a new pg and add it to pg list + pass + + def init_process_group(self): + # subclasses will initialize them fully + # - initialize a default world process group and add it to pg list + self.initialized = True diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py new file mode 100644 index 0000000000000000000000000000000000000000..302e3eaf2210a6c399d41571e9b92eb21f0b0fce --- /dev/null +++ b/deepspeed/comm/comm.py @@ -0,0 +1,778 @@ +""" + Copyright 2021 The Microsoft DeepSpeed Team + + DeepSpeed Communication Package: deepspeed.comm + + deepspeed.comm + -- import and use deepspeeed.ops.comm + -- use torch.distributed directly if both this package and torch.distributed use the same NCCL version + -- use custom collectives + -- can either use 
torch.dist or ds.ops.comm? + + Note: the old 1-bit compressed allreduce variants that resided in deepspeed.runtime.comm will be moved here as well. + + deepspeed.comm API + -- must be kept fully compatible (same signatures) as torch.dist API to ensure backward/cross-framework compatibility. + -- e.g. if a client code used + from deepspeed import comm as dist + + instead of + import torch.distributed as dist + + The code should work without breaking any of the public torch.distributed functionality + + Future: + -- deepspeed groups API should be brought into ds.comm +""" + +from enum import Enum +import torch +import os + +from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT, default_pg_timeout +from .constants import * +from deepspeed.accelerator import get_accelerator + + +class ReduceOp(Enum): + SUM = 0 + PRODUCT = 1 + MIN = 2 + MAX = 3 + BAND = 4 + BOR = 5 + BXOR = 6 + AVG = 7 + UNUSED = 8 + + +from deepspeed.utils.comms_logging import CommsLogger +from deepspeed.utils import timer, get_caller_func +from deepspeed.comm.torch import TorchBackend + +from deepspeed import utils +from datetime import timedelta + +# Current deepspeed.comm backend (cdb) global object for simple access by client code +use_ds_backend = False +cdb = None + +# Create global timer for ops +timers = timer.SynchronizedWallClockTimer() +timer_summary = {} + +comms_logger = CommsLogger() + +# Ensure we don't warn about base collectives more than once +has_warned_all_gather = False +has_warned_reduce_scatter = False + +# Maintain objects of all initialized ds backends and assign them using the API functions in this file +nccl_backend = None +mpi_backend = None + +# This should be set here so all rank/size information from the launcher can be propagated +from deepspeed.comm.utils import * + + +def _configure_using_config_file(config): + if config.comms_logger_enabled: + comms_logger.configure(config) + + +def configure( + deepspeed_config=None, + enabled=None, + prof_all=None, + prof_ops=None, 
+ verbose=None, + debug=None, +): + + if deepspeed_config is not None: + _configure_using_config_file(deepspeed_config.comms_config) + + if enabled is not None: + comms_logger.enabled = enabled + + if prof_all is not None: + comms_logger.prof_all = prof_all + + if prof_ops is not None: + comms_logger.prof_ops = prof_ops + + if verbose is not None: + comms_logger.verbose = verbose + + if debug is not None: + comms_logger.debug = debug + + +# Logging wrapper for timing ops +def timed_op(func): + def log_wrapper(*args, **kwargs): + # Add enabled flag so that overhead to each comm op is two if conditions at most + if comms_logger.enabled: + if ('prof' in kwargs and kwargs['prof']) or comms_logger.prof_all or ( + 'log_name' in kwargs + and kwargs['log_name'] in comms_logger.prof_ops): + # Need func args for their defaults + func_args = get_default_args(func) + func_args.update(kwargs) + msg_size = get_msg_size_from_args(func, *args, **kwargs) + log_name = get_debug_log_name(func_args, comms_logger.debug) + timers(log_name).start() + # Return the op, then stop the op's timer + try: + return func(*args, **kwargs) + finally: + if comms_logger.enabled: + # Need to make op blocking for accurate logging + get_accelerator().synchronize() + # If we're using MPI, we can't simply sync the stream + if cdb.using_mpi: + cdb.barrier() + if ('prof' in kwargs and kwargs['prof']) or comms_logger.prof_all or ( + 'log_name' in kwargs + and kwargs['log_name'] in comms_logger.prof_ops): + log_name = get_debug_log_name(func_args, comms_logger.debug) + raw_name = func.__name__ + timers(log_name).stop() + # need temp var since 'elapsed' resets events + time_elapsed = timers(log_name).elapsed(reset=False) + comms_logger.append(raw_name, log_name, time_elapsed, msg_size) + + return log_wrapper + + +# For compatibility with torch distributed's init_process_group, we shall retain the signature from PyTorch code. 
+# DeepSpeed NCCL/MPI backend may not need all these params as we will have our own implementation. +# Please read full torch.distributed API docs from https://pytorch.org/docs/stable/distributed.html + + +# UNUSED: Future helper function to initialize DS backends +def init_deepspeed_backend(ds_backend): + global cdb + global nccl_backend + global mpi_backend + global use_ds_backend + + if ds_backend == NCCL_BACKEND: + utils.logger.warn("NCCL backend in DeepSpeed not yet implemented") + elif ds_backend == MPI_BACKEND: + utils.logger.warn("MPI backend in DeepSpeed not yet implemented") + elif ds_backend == GLOO_BACKEND: + utils.logger.warn("Gloo backend in DeepSpeed not yet implemented") + else: + utils.logger.warn(f"DeepSpeed does not support {ds_backend} backend") + + +def is_initialized(): + #assert cdb is not None, 'DeepSpeed backend not set, please initialize it using init_process_group()' + if cdb is None: + return False + else: + return cdb.is_initialized() + + +def destroy_process_group(group=None): + global cdb + return cdb.destroy_process_group(group=group) + + +def new_group(ranks): + global cdb + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + return cdb.new_group(ranks) + + +def is_available() -> bool: + + # Returns ``True`` if the deepspeed comm package is available. + + # TODO: load other ops. Clients including deepspeed itself should use deepspeed.comm to import + # any communication related primitives from this package. + # use hasattr(deepspeed.csrc.ops, "_comm") or something + return True + + +def set_backend(backend): + if not use_ds_backend: + utils.logger.error( + "DeepSpeed communication backend is required. Please use deepspeed.comm.init_distributed(backend, use_deepspeed=True) to use this functionality" + ) + raise RuntimeError( + 'Error: Custom DeepSpeed backend called without initializing DeepSpeed distributed.' 
+ ) + + global cdb + global nccl_backend + global mpi_backend + + try: + if backend_name == NCCL_BACKEND: + if nccl_backend is not None and nccl_backend.is_initialized(): + cdb = nccl_backend + elif backend_name == MPI_BACKEND: + if mpi_backend is not None and mpi_backend.is_initialized(): + cdb = mpi_backend + except Exception as inst: + print(inst) + + +@timed_op +def broadcast(tensor, + src, + group=None, + async_op=False, + prof=False, + log_name='broadcast', + debug=get_caller_func()): + global cdb + return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) + + +@timed_op +def all_gather(tensor_list, + tensor, + group=None, + async_op=False, + prof=False, + log_name='all_gather', + debug=get_caller_func()): + global cdb + return cdb.all_gather(tensor_list=tensor_list, + tensor=tensor, + group=group, + async_op=async_op) + + +def has_reduce_scatter_base(): + global cdb + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb.has_reduce_scatter_base is not None, 'has_reduce_scatter_base is not yet defined' + return cdb.has_reduce_scatter_base + + +def reduce_scatter_fn(output_tensor, + tensor, + op=ReduceOp.SUM, + group=None, + async_op=False, + prof=False, + debug=get_caller_func()): + global cdb + global has_warned_reduce_scatter + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + if cdb.has_reduce_scatter_base: + return reduce_scatter_base(output_tensor, + tensor, + op=op, + group=group, + async_op=async_op, + prof=prof, + debug=debug) + else: + if not has_warned_reduce_scatter: + utils.logger.warning( + "unable to find torch.distributed._reduce_scatter_base. will fall back to " + "torch.distributed.all_gather which will result in suboptimal performance. 
" + "please consider upgrading your pytorch installation.") + has_warned_reduce_scatter = True + input_tensor_lst = list(torch.chunk(tensor, cdb.get_world_size(group))) + return reduce_scatter(output_tensor, + input_tensor_lst, + op=op, + group=group, + async_op=async_op, + prof=prof, + debug=debug) + + +@timed_op +def reduce_scatter_base(output_tensor, + tensor, + op=ReduceOp.SUM, + group=None, + async_op=False, + prof=False, + log_name='reduce_scatter_base', + debug=get_caller_func()): + global cdb + return cdb.reduce_scatter_base(output_tensor=output_tensor, + input_tensor=tensor, + op=op, + group=group, + async_op=async_op) + + +@timed_op +def all_gather_base(output_tensor, + tensor, + group=None, + async_op=False, + prof=False, + log_name='all_gather_base', + debug=get_caller_func()): + global cdb + return cdb.all_gather_base(output_tensor=output_tensor, + input_tensor=tensor, + group=group, + async_op=async_op) + + +def has_allgather_base(): + global cdb + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb.has_allgather_base is not None, 'has_allgather_base is not yet defined' + return cdb.has_allgather_base + + +def allgather_fn(output_tensor, + input_tensor, + group=None, + async_op=False, + debug=get_caller_func()): + global cdb + global has_warned_all_gather + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + if cdb.has_allgather_base: + return all_gather_base(output_tensor, + input_tensor, + group=group, + async_op=async_op, + debug=debug) + else: + if not has_warned_all_gather and get_rank() == 0: + utils.logger.warning( + "unable to find torch.distributed._all_gather_base. will fall back to " + "torch.distributed.all_gather which will result in suboptimal performance. 
" + "please consider upgrading your pytorch installation.") + has_warned_all_gather = True + output_tensors = list(torch.chunk(output_tensor, cdb.get_world_size(group))) + return all_gather(output_tensors, + input_tensor, + group=group, + async_op=async_op, + debug=debug) + + +@timed_op +def all_to_all_single(output, + tensor, + output_split_sizes=None, + input_split_sizes=None, + group=None, + async_op=False, + prof=False, + log_name='all_to_all_single', + debug=get_caller_func()): + global cdb + return cdb.all_to_all_single(output=output, + input=tensor, + output_split_sizes=output_split_sizes, + input_split_sizes=input_split_sizes, + group=group, + async_op=async_op) + + +@timed_op +def send(tensor, + dst, + group=None, + tag=0, + prof=False, + log_name='send', + debug=get_caller_func()): + global cdb + return cdb.send(tensor=tensor, dst=dst, group=group, tag=tag) + + +@timed_op +def recv(tensor, + src=None, + group=None, + tag=0, + prof=False, + log_name='recv', + debug=get_caller_func()): + global cdb + return cdb.recv(tensor=tensor, src=src, group=group, tag=tag) + + +@timed_op +def isend(tensor, + dst, + group=None, + tag=0, + prof=False, + log_name='isend', + debug=get_caller_func()): + global cdb + return cdb.send(tensor=tensor, dst=dst, group=group, tag=tag) + + +@timed_op +def irecv(tensor, + src=None, + group=None, + tag=0, + prof=False, + log_name='irecv', + debug=get_caller_func()): + global cdb + return cdb.recv(tensor=tensor, src=src, group=group, tag=tag) + + +@timed_op +def gather(tensor, + gather_list=None, + dst=0, + group=None, + async_op=False, + prof=False, + log_name='gather', + debug=get_caller_func()): + global cdb + return cdb.gather(tensor=tensor, + gather_list=gather_list, + dst=dst, + group=group, + async_op=async_op) + + +@timed_op +def scatter(tensor, + scatter_list=None, + src=0, + group=None, + async_op=False, + prof=False, + log_name='scatter', + debug=get_caller_func()): + global cdb + return cdb.scatter(tensor=tensor, + 
scatter_list=scatter_list, + src=src, + group=group, + async_op=async_op) + + +@timed_op +def barrier(group=None, + async_op=False, + device_ids=None, + prof=False, + log_name='barrier', + debug=get_caller_func()): + global cdb + return cdb.barrier(group=group, async_op=async_op, device_ids=device_ids) + + +@timed_op +def monitored_barrier(group=None, + timeout=None, + wait_all_ranks=False, + prof=False, + log_name='monitored_barrier', + debug=get_caller_func()): + global cdb + return cdb.barrier(group=group, timeout=timeout, wait_all_ranks=wait_all_ranks) + + +def log_summary(): + global cdb + barrier(log_name='log_summary_barrier') + if cdb.get_rank() == 0: + comms_logger.log_all() + barrier(log_name='log_summary_barrier') + + +@timed_op +def reduce(tensor, + dst, + op=ReduceOp.SUM, + group=None, + async_op=False, + prof=False, + log_name='reduce', + debug=get_caller_func()): + global cdb + return cdb.reduce(tensor=tensor, dst=dst, op=op, group=group, async_op=async_op) + + +@timed_op +def reduce_scatter(output, + input_list, + op=ReduceOp.SUM, + group=None, + async_op=False, + prof=False, + log_name='reduce_scatter', + debug=get_caller_func()): + global cdb + return cdb.reduce_scatter(output=output, + input_list=input_list, + op=op, + group=group, + async_op=async_op) + + +@timed_op +def all_reduce(tensor, + op=ReduceOp.SUM, + group=None, + async_op=False, + prof=False, + log_name='all_reduce', + debug=get_caller_func()): + #if profile_comm: + # context of the timers? + # timers.start() + # TensorBoard logging for comm calls.? 
+ global cdb + #print(f'op = {op}, cdb= {cdb.name}') + return cdb.all_reduce(tensor, op, group, async_op) + + +def get_world_group(): + global cdb + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + return cdb.get_world_group() + + +def get_world_size(group=None) -> int: + """ + Returns the number of processes in the current process group + Args: + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + Returns: + The world size of the process group + -1, if not part of the group + """ + global cdb + + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + return cdb.get_world_size(group) + + +def get_rank(group=None): + """ + Returns the rank of the current process in the provided ``group`` or the + default group if none was provided. + Rank is a unique identifier assigned to each process within a distributed + process group. They are always consecutive integers ranging from 0 to + ``world_size``. + Args: + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. 
+ Returns: + The rank of the process group + -1, if not part of the group + """ + global cdb + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + return cdb.get_rank(group) + + +def get_local_rank(): + """ + Helper function to get local rank after a backend has been set and initialized + Args: + None + Returns: + local rank (= GPU device ID) + """ + global cdb + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + return get_local_rank_from_launcher() + + +def get_global_rank(group=None, group_rank=0): + global cdb + assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + return cdb.get_global_rank(group, group_rank) + + +# Main DeepSpeed Comms. public API. +def init_distributed(dist_backend=None, + auto_mpi_discovery=True, + distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, + verbose=True, + timeout=default_pg_timeout, + init_method=None, + dist_init_required=None, + config=None, + rank=-1, + world_size=-1): + ''' Initialize dist backend, potentially performing MPI discovery if needed + + Arguments: + dist_backend: Optional (str). torch distributed backend, e.g., nccl, mpi, gloo + auto_mpi_discovery Optional (bool). if distributed environment variables are not set, attempt to discover them from MPI + distributed_port: Optional (int). torch distributed backend port + verbose: Optional (bool). verbose logging + timeout: Optional (timedelta). Timeout for operations executed against the process group. Default value equals 30 minutes. + init_method: Optional (string). Torch distributed, URL specifying how to initialize the process group. Default is “env://” if no init_method or store is specified. + config: Optional (dict). DeepSpeed configuration for setting up comms options (e.g. Comms profiling) + rank: Optional (int). 
The current manually specified rank. Some init_method like “tcp://” need the rank and world_size as well (see: https://pytorch.org/docs/stable/distributed.html#tcp-initialization) + world_size: Optional (int). Desired world_size for the TCP or Shared file-system initialization. + ''' + global cdb + + configure(deepspeed_config=config) + + if dist_init_required is None: + dist_init_required = cdb is None or not cdb.is_initialized() + + if cdb is None and torch.distributed.is_initialized(): + # The user initialized torch.dist themselves, create cdb and short-circuit + cdb = TorchBackend(dist_backend, timeout, init_method) + return + + if dist_init_required is False: + assert ( + cdb is not None and cdb.is_initialized() is True + ), "Distributed backend is not initialized. Please set dist_init_required to True or initialize before calling deepspeed.initialize()" + else: + # Initialize torch distributed if needed + required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] + if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)): + if verbose: + utils.logger.info( + "Not using the DeepSpeed or dist launchers, attempting to detect MPI environment..." 
def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True):
    '''
    Discover the MPI environment via mpi4py and map it onto the environment
    variables torch.distributed expects (RANK, WORLD_SIZE, LOCAL_RANK,
    MASTER_ADDR, MASTER_PORT).

    Args:
        distributed_port (int): port exported as MASTER_PORT.
        verbose (bool): log the discovered settings on every rank.
    '''
    from mpi4py import MPI
    import subprocess
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()

    # Rank 0 resolves its own IP address and broadcasts it as the master addr.
    master_addr = None
    if rank == 0:
        # Argument-list form avoids spawning a shell (was: "hostname -I" with
        # shell=True); output is identical.
        result = subprocess.check_output(["hostname", "-I"])
        master_addr = result.decode('utf-8').split()[0]
    master_addr = comm.bcast(master_addr, root=0)

    # Determine local rank by assuming hostnames are unique
    proc_name = MPI.Get_processor_name()
    all_procs = comm.allgather(proc_name)
    local_rank = sum(name == proc_name for name in all_procs[:rank])

    os.environ['RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    os.environ['LOCAL_RANK'] = str(local_rank)
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = str(distributed_port)

    if verbose:
        utils.logger.info(
            "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
            .format(os.environ['RANK'],
                    os.environ['LOCAL_RANK'],
                    os.environ['WORLD_SIZE'],
                    os.environ['MASTER_ADDR'],
                    os.environ['MASTER_PORT']))

    # If a torch backend is already up (module-level cdb handle), verify that
    # MPI's view of the job agrees with it.
    if cdb is not None and cdb.is_initialized():
        assert cdb.get_rank() == rank, "MPI rank {} does not match torch rank {}".format(
            rank, cdb.get_rank())
        assert cdb.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format(
            world_size, cdb.get_world_size())


def in_aml():
    # Are we running inside an Azure Machine Learning (AML) environment?
    return 'AZUREML_EXPERIMENT_ID' in os.environ


def in_aws_sm():
    # Are we running inside an AWS SageMaker environment?
    return 'SM_TRAINING_ENV' in os.environ


def in_dlts():
    # Are we running on a DLTS cluster?
    return 'DLTS_JOB_ID' in os.environ


def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True):
    """Map Azure ML's OpenMPI environment variables onto torch.distributed's.

    Adapted from Azure ML's documentation:
    https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi

    Args:
        master_port (int): port to use on multi-node jobs when MASTER_PORT
            is not already set.
        verbose (bool): log the resulting settings.
    """
    os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
    os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"]
    # Single node iff every rank in the world is local to this node.
    single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int(
        os.environ["WORLD_SIZE"])

    if not single_node:
        master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":")
        os.environ["MASTER_ADDR"] = master_node_params[0]
        # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE
        if "MASTER_PORT" not in os.environ:
            os.environ["MASTER_PORT"] = str(master_port)
    else:
        os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"]
        os.environ["MASTER_PORT"] = DEFAULT_AML_MASTER_PORT

    if verbose:
        # .get() instead of [] so logging the prior value cannot raise
        # KeyError when NCCL_SOCKET_IFNAME was never set.
        utils.logger.info("NCCL_SOCKET_IFNAME original value = {}".format(
            os.environ.get("NCCL_SOCKET_IFNAME")))

    os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_AML_NCCL_SOCKET_IFNAME
    os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]

    if verbose:
        utils.logger.info(
            "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
            .format(os.environ['RANK'],
                    os.environ['LOCAL_RANK'],
                    os.environ['WORLD_SIZE'],
                    os.environ['MASTER_ADDR'],
                    os.environ['MASTER_PORT']))


def patch_aws_sm_env_for_torch_nccl_backend(verbose=True):
    """Map AWS SageMaker's OpenMPI environment variables onto torch.distributed's.

    Args:
        verbose (bool): log the resulting settings. Assumes MASTER_ADDR and
            MASTER_PORT are already set by SageMaker when logging them.
    """
    os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
    os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]
    os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"]

    if verbose:
        utils.logger.info(
            "Discovered AWS SageMaker settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
            .format(os.environ['RANK'],
                    os.environ['LOCAL_RANK'],
                    os.environ['WORLD_SIZE'],
                    os.environ['MASTER_ADDR'],
                    os.environ['MASTER_PORT']))
class CommsConfig(BaseModel):
    """Base pydantic model for comms-related config sections.

    The inner ``Config`` class tunes pydantic's validation behavior for
    every subclass.
    """
    class Config:
        # Validate default values, not only user-supplied ones.
        validate_all = True
        # Re-validate fields when they are assigned after construction.
        validate_assignment = True
        # Store enum members by value rather than as Enum instances.
        use_enum_values = True
        # Reject unknown keys instead of silently ignoring them.
        extra = 'forbid'


class CommsLoggerConfig(CommsConfig):
    """Schema for the "comms_logger" section of a DeepSpeed config.

    Defaults come from deepspeed.comm.constants (COMMS_LOGGER_*_DEFAULT).
    """
    # Master on/off switch for the communication logger.
    enabled: bool = COMMS_LOGGER_ENABLED_DEFAULT
    # Profile every communication op.
    prof_all: bool = COMMS_LOGGER_PROF_ALL_DEFAULT
    # Explicit list of op names to profile.
    prof_ops: list = COMMS_LOGGER_PROF_OPS_DEFAULT
    # Verbose per-op logging.
    verbose: bool = COMMS_LOGGER_VERBOSE_DEFAULT
    # Include caller information in log names (see get_debug_log_name).
    debug: bool = COMMS_LOGGER_DEBUG_DEFAULT


class DeepSpeedCommsConfig:
    """Extracts and validates comms settings from a parsed DeepSpeed config dict."""
    def __init__(self, ds_config):
        # ds_config: the already-parsed DeepSpeed config dictionary.
        self.comms_logger_enabled = 'comms_logger' in ds_config

        if self.comms_logger_enabled:
            # Validate the user-provided section through pydantic.
            self.comms_logger = CommsLoggerConfig(**ds_config['comms_logger'])
# Users can configure in ds_config.json as below example:
# Human-readable usage example; referenced when reporting config errors.
COMMS_LOGGER_FORMAT = '''
The Comms Logger can be specified as:
"comms_logger": {
  "enabled": true,
  "verbose": false,
  "prof_all": true,
  "debug": false,
  "prof_ops": ["all_reduce", "custom_all_reduce_name"]
}
'''
# Top-level key in ds_config.json for this feature.
COMMS_LOGGER = "comms_logger"

# Comms logger enable signal
COMMS_LOGGER_ENABLED = "enabled"
COMMS_LOGGER_ENABLED_DEFAULT = False

# Comms logger verbose signal
COMMS_LOGGER_VERBOSE = "verbose"
COMMS_LOGGER_VERBOSE_DEFAULT = False

# comms logger profile all ops signal
COMMS_LOGGER_PROF_ALL = "prof_all"
COMMS_LOGGER_PROF_ALL_DEFAULT = True

# comms logger show all ops signal
COMMS_LOGGER_DEBUG = "debug"
COMMS_LOGGER_DEBUG_DEFAULT = False

# comms logger profile specific ops in list
COMMS_LOGGER_PROF_OPS = "prof_ops"
COMMS_LOGGER_PROF_OPS_DEFAULT = []
+ """ + def __init__(self, + backend, + timeout, + init_method, + rank=-1, + world_size=-1, + name='torch'): + super(TorchBackend, self).__init__() + self.torch_version_before_18 = older_torch() + self.has_allgather_base = has_allgather_base() + self.has_reduce_scatter_base = has_reduce_scatter_base() + self.initialized = True + self.name = name + # Future functionality to support ds.initialize() on a single GPU + # The idea is to fake that dist backend is initialized even when + # it is not so we can run on a single GPU without doing any init_process_group + self.single_gpu_mode = True + self.init_process_group(backend, timeout, init_method, rank, world_size) + + def init_process_group(self, backend, timeout, init_method, rank, world_size): + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend, + timeout=timeout, + init_method=init_method, + rank=rank, + world_size=world_size) + self.using_mpi = torch.distributed.get_backend() == 'mpi' + + def all_reduce(self, + tensor, + op=torch.distributed.ReduceOp.SUM, + group=None, + async_op=False): + op = self._reduce_op(op) + return torch.distributed.all_reduce(tensor=tensor, + op=op, + group=group, + async_op=async_op) + + def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=False): + return torch.distributed.reduce(tensor=tensor, + dst=dst, + op=self._reduce_op(op), + group=group, + async_op=async_op) + + def reduce_scatter(self, + output, + input_list, + op=ReduceOp.SUM, + group=None, + async_op=False): + return torch.distributed.reduce_scatter(output=output, + input_list=input_list, + op=self._reduce_op(op), + group=group, + async_op=async_op) + + def broadcast(self, tensor, src, group=None, async_op=False): + return torch.distributed.broadcast(tensor=tensor, + src=src, + group=group, + async_op=async_op) + + def all_gather(self, tensor_list, tensor, group=None, async_op=False): + return torch.distributed.all_gather(tensor_list=tensor_list, + tensor=tensor, + 
group=group, + async_op=async_op) + + def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=False): + if self.has_allgather_base: + return torch.distributed.distributed_c10d._all_gather_base( + output_tensor=output_tensor, + input_tensor=input_tensor, + group=group, + async_op=async_op) + else: + utils.logger.warning( + "unable to find torch.distributed._all_gather_base. will fall back to " + "torch.distributed.reduce_scatter which will result in suboptimal performance. " + "please consider upgrading your pytorch installation.") + pass + + def reduce_scatter_base(self, + output_tensor, + input_tensor, + op=ReduceOp.SUM, + group=None, + async_op=False): + if self.has_reduce_scatter_base: + return torch.distributed._reduce_scatter_base(output_tensor, + input_tensor, + op=self._reduce_op(op), + group=group, + async_op=async_op) + else: + utils.logger.warning( + "unable to find torch.distributed._reduce_scatter_base. will fall back to " + "torch.distributed.reduce_scatter which will result in suboptimal performance. 
" + "please consider upgrading your pytorch installation.") + pass + + def all_to_all_single(self, + output, + input, + output_split_sizes=None, + input_split_sizes=None, + group=None, + async_op=False): + return torch.distributed.all_to_all_single(output=output, + input=input, + output_split_sizes=output_split_sizes, + input_split_sizes=input_split_sizes, + group=group, + async_op=async_op) + + def send(self, tensor, dst, group=None, tag=0): + return torch.distributed.send(tensor=tensor, dst=dst, group=group, tag=tag) + + def recv(self, tensor, src=None, group=None, tag=0): + return torch.distributed.recv(tensor=tensor, src=src, group=group, tag=tag) + + def isend(self, tensor, dst, group=None, tag=0): + return torch.distributed.isend(tensor=tensor, dst=dst, group=group, tag=tag) + + def irecv(self, tensor, src=None, group=None, tag=0): + return torch.distributed.irecv(tensor=tensor, src=src, group=group, tag=tag) + + def gather(self, tensor, gather_list=None, dst=0, group=None, async_op=False): + return torch.distributed.gather(tensor=tensor, + gather_list=gather_list, + dst=dst, + group=group, + async_op=async_op) + + def scatter(self, tensor, scatter_list=None, src=0, group=None, async_op=False): + return torch.distributed.scatter(tensor=tensor, + scatter_list=scatter_list, + src=src, + group=group, + async_op=async_op) + + def barrier(self, + group=torch.distributed.GroupMember.WORLD, + async_op=False, + device_ids=None): + if group is None: + group = torch.distributed.GroupMember.WORLD + return torch.distributed.barrier(group=group, + async_op=async_op, + device_ids=device_ids) + + def monitored_barrier(self, + group=torch.distributed.GroupMember.WORLD, + timeout=None, + wait_all_ranks=False): + if group is None: + group = torch.distributed.GroupMember.WORLD + return torch.distributed.monitored_barrier(group=group, + timeout=timeout, + wait_all_ranks=wait_all_ranks) + + def get_rank(self, group=None): + return torch.distributed.get_rank(group=group) + + def 
get_world_size(self, group=None): + return torch.distributed.get_world_size(group=group) + + def is_initialized(self): + return torch.distributed.is_initialized() + + def get_backend(self, group=None): + return torch.distributed.get_backend(group=group) + + def new_group(self, ranks): + return torch.distributed.new_group(ranks) + + def get_global_rank(self, group, group_rank): + if hasattr(torch.distributed.distributed_c10d, "get_global_rank"): + from torch.distributed.distributed_c10d import get_global_rank as _get_global_rank + else: + from torch.distributed.distributed_c10d import _get_global_rank + return _get_global_rank(group, group_rank) + + def get_world_group(self): + return torch.distributed.group.WORLD + + def destroy_process_group(self, group=None): + return torch.distributed.destroy_process_group(group=group) + + def _reduce_op(self, op): + ''' + Helper function. If the op provided is not a torch.dist.ReduceOp, convert it and return + ''' + if not isinstance(op, torch.distributed.ReduceOp): + if op == ReduceOp.SUM: + op = torch.distributed.ReduceOp.SUM + elif op == ReduceOp.PRODUCT: + op = torch.distributed.ReduceOp.PRODUCT + elif op == ReduceOp.AVG: + op = torch.distributed.ReduceOp.AVG + elif op == ReduceOp.MIN: + op = torch.distributed.ReduceOp.MIN + elif op == ReduceOp.MAX: + op = torch.distributed.ReduceOp.MAX + elif op == ReduceOp.BAND: + op = torch.distributed.ReduceOp.BAND + elif op == ReduceOp.BOR: + op = torch.distributed.ReduceOp.BOR + elif op == ReduceOp.BXOR: + op = torch.distributed.ReduceOp.BXOR + return op + + +# This will become a light-weight wrapper around torch.distributed functions +# TODO: create some example to show how this wrapper can help profile communication +# TODO: make sure there is no performance regression with this approach +# TODO: explore monkey-patching if this does not work diff --git a/deepspeed/comm/utils.py b/deepspeed/comm/utils.py new file mode 100644 index 
def older_torch():
    '''
    Return True when the installed torch is a 1.x release older than 1.8.

    Before 1.8, torch.distributed used torch.distributed.group.WORLD as the
    default group argument instead of None.
    See more details at: https://github.com/pytorch/pytorch/pull/48767
    '''
    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])
    return TORCH_MAJOR == 1 and TORCH_MINOR < 8


def has_allgather_base():
    '''
    Helper to check if torch.distributed has _all_gather_base
    '''
    return hasattr(torch.distributed, "_all_gather_base")


def has_reduce_scatter_base():
    '''
    Helper to check if torch.distributed has _reduce_scatter_base
    '''
    return hasattr(torch.distributed, "_reduce_scatter_base")


def get_local_rank_from_launcher():
    '''Local rank from the launcher env (LOCAL_RANK, then OMPI), defaulting to 0.'''
    # DeepSpeed launcher will set it so get from there
    rank = os.environ.get('LOCAL_RANK')

    if rank is None:
        rank = os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK')

    # Make it a single process job and set rank to 0
    if rank is None:
        rank = 0

    return int(rank)


def get_world_rank_from_launcher():
    '''World rank from the launcher env (RANK, then OMPI), defaulting to 0.'''
    # DeepSpeed launcher will set it so get from there
    rank = os.environ.get('RANK')

    if rank is None:
        rank = os.environ.get('OMPI_COMM_WORLD_RANK')

    # Make it a single process job and set rank to 0
    if rank is None:
        rank = 0

    return int(rank)


def get_world_size_from_launcher():
    '''World size from the launcher env (WORLD_SIZE, then OMPI), defaulting to 1.'''
    # DeepSpeed launcher will set it so get from there
    size = os.environ.get('WORLD_SIZE')
    rank = os.environ.get('RANK')

    if size is None:
        size = os.environ.get('OMPI_COMM_WORLD_SIZE')

    # Make it a single process job and set size to 1
    if size is None:
        size = 1

    # Only rank 0 announces the world size. Env values are strings, so the
    # previous `rank == 0` compared '0' to int 0 and never fired; parse the
    # rank (a missing RANK means a single-process job, treated as rank 0).
    if rank is None or int(rank) == 0:
        print(f"set world size to {size}")

    return int(size)


def get_default_args(func):
    '''Return {param_name: default} for every parameter of func with a default.'''
    signature = inspect.signature(func)
    return {
        k: v.default
        for k,
        v in signature.parameters.items() if v.default is not inspect.Parameter.empty
    }


# We need this hacky function since torch doesn't consistently name or place the input tensor args
def get_tensor_position(func):
    '''Positional index of func's input-tensor parameter, or -1 if it has none.'''
    sig_params = inspect.signature(func).parameters
    arg = None
    # most colls
    if 'tensor' in sig_params:
        arg = 'tensor'
    # reduce scatter coll
    elif 'input_list' in sig_params:
        arg = 'input_list'
    # all_to_all and torch multiGPU colls
    elif 'input_tensor_list' in sig_params:
        arg = 'input_tensor_list'
    return -1 if arg is None else list(sig_params).index(arg)


def get_tensor_kwarg(func, kwargs):
    '''Input tensor (or tensor list) for func as resolved from kwargs/defaults.'''
    func_args = get_default_args(func)
    func_args.update(kwargs)
    arg = None

    if 'tensor' in func_args:
        arg = func_args['tensor']
    elif 'input_list' in func_args:
        arg = func_args['input_list']
    elif 'input_tensor_list' in func_args:
        arg = func_args['input_tensor_list']
    return arg


def get_msg_size_from_args(func, *args, **kwargs):
    '''Total message size (bytes) of the input tensor(s) of a collective call.

    Returns 0 when the op carries no tensor payload (e.g. barrier).
    '''
    # 3 cases:
    #   - tensor arg is in args
    #   - tensor arg is in kwargs
    #   - tensor arg is not present (e.g. barrier)
    tensor_arg = None
    # check if tensor arg is in args (guard against the tensor having been
    # passed by keyword while other positionals are present)
    if len(args) > 0:
        tensor_arg_position = get_tensor_position(func)
        if -1 < tensor_arg_position < len(args):
            tensor_arg = args[tensor_arg_position]
    # check if tensor arg is in kwargs
    if tensor_arg is None and len(kwargs) > 0:
        tensor_arg = get_tensor_kwarg(func, kwargs)
    # if tensor arg is not present, no data is being transmitted
    if tensor_arg is None:
        return 0
    else:
        # Sum of tensor sizes for list colls such as torch's all_to_all
        # NOTE: msg_size for list colls will not be the actual size transmitted by a given MPI/NCCL call within the coll op. Instead, it's the total amount of data transmitted.
        if type(tensor_arg) is list:
            return sum(x.element_size() * x.nelement() for x in tensor_arg)
        else:
            return tensor_arg.element_size() * tensor_arg.nelement()


def get_debug_log_name(func_args, debug):
    '''Log name for a comms op; with debug=True, appends the calling function
    (via deepspeed.utils.get_caller_func).'''
    if debug:
        return func_args['log_name'] + ' | [Caller Func: ' + get_caller_func() + ']'
    else:
        return func_args['log_name']
    def __init__(self, act_range_momentum=0.95, quant_mode='symmetric'):
        # act_range_momentum: EMA momentum for the observed activation range.
        # quant_mode: 'symmetric' selects SymQuantizer, anything else AsymQuantizer.
        super(QuantAct, self).__init__()

        self.act_range_momentum = act_range_momentum
        self.quant_mode = quant_mode
        if quant_mode == 'symmetric':
            self.act_function = SymQuantizer.apply
        else:
            self.act_function = AsymQuantizer.apply

        # Persistent [min, max] of observed activations; starts at [0, 0],
        # which also serves as the "uninitialized" marker below.
        self.register_buffer('x_min_max', torch.zeros(2))

    def forward(self, x, num_bits, *args):
        """
        x: the activation that we need to quantize
        num_bits: the number of bits we need to quantize the activation to
        *args: some extra arguments that are useless but needed for align with the interface of other quantization functions
        """
        # Only track/update the quantization range while training; at eval
        # time the frozen range is used for all inputs.
        if self.training:
            x_min = x.data.min()
            x_max = x.data.max()

            # Initialization: min == max means the buffer was never updated.
            if self.x_min_max[0] == self.x_min_max[1]:
                self.x_min_max[0] = x_min
                self.x_min_max[1] = x_max

            # if do not need momentum, please set self.act_range_momentum = 0
            self.x_min_max[0] = self.x_min_max[0] * self.act_range_momentum + x_min * (
                1 - self.act_range_momentum)
            self.x_min_max[1] = self.x_min_max[1] * self.act_range_momentum + x_max * (
                1 - self.act_range_momentum)

        x_q = self.act_function(x, num_bits, self.x_min_max[0], self.x_min_max[1])

        return x_q


class Embedding_Compress(nn.Embedding):
    """nn.Embedding with optional in-forward weight quantization."""
    def __init__(self, *kargs):
        super(Embedding_Compress, self).__init__(*kargs)
        # Quantization metadata is stored directly on the weight Parameter.
        self.weight.start_bits = None
        self.weight.target_bits = None
        self.weight.q_period = None
        self.weight_quantization_enabled_in_forward = False
        self.weight_quantization_enabled = False

    def extra_repr(self):
        return 'num_embeddings={}, embedding_dim={}, weight_quantization={}'.format(
            self.num_embeddings,
            self.embedding_dim,
            self.weight.target_bits)

    def enable_weight_quantization(self,
                                   start_bits,
                                   target_bits,
                                   quantization_period,
                                   weight_quantization_enabled_in_forward,
                                   quantization_type,
                                   num_groups):
        """Configure weight quantization; quantizer choice depends on target_bits.

        num_groups is ignored for embeddings (token-wise grouping is forced).
        """
        self.weight.start_bits = start_bits
        self.weight.target_bits = target_bits
        self.weight.q_period = quantization_period
        self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward
        if self.weight_quantization_enabled_in_forward:
            logger.warning(
                "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider to use DS-FP16 optimizer************"
            )
            # >=3 bits: sym/asym quantizer; 2 bits: ternary; 1 bit: binary.
            if self.weight.target_bits >= 3:
                if quantization_type == 'symmetric':
                    self.weight_quantizer = SymQuantizer.apply
                else:
                    self.weight_quantizer = AsymQuantizer.apply
            elif self.weight.target_bits == 2:
                assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for ternary weight quantization'
                self.weight_quantizer = TernaryQuantizer.apply
            elif self.weight.target_bits == 1:
                assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for binary weight quantization'
                self.weight_quantizer = BinaryQuantizer.apply
            # for embedding, we always use token-wise quantization
            self.weight_quantize_num_groups = self.weight.size(0)

    def fix_weight_quantization(self):
        """Bake the quantized values into the weight and stop quantizing in forward."""
        self.weight.data = self.weight_quantizer(self.weight,
                                                 self.weight.target_bits,
                                                 None,
                                                 None,
                                                 self.weight_quantize_num_groups).data
        self.weight_quantization_enabled_in_forward = False
        return None

    def forward(self, input):
        # Quantize on the fly only when both the mode and the runtime switch are on.
        if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled:
            weight = self.weight_quantizer(self.weight,
                                           self.weight.target_bits,
                                           None,
                                           None,
                                           self.weight_quantize_num_groups)
        else:
            weight = self.weight

        out = nn.functional.embedding(input,
                                      weight,
                                      self.padding_idx,
                                      self.max_norm,
                                      self.norm_type,
                                      self.scale_grad_by_freq,
                                      self.sparse)
        return out
+ """ + def __init__(self, *kargs, bias=True): + super(LinearLayer_Compress, self).__init__(*kargs, bias=bias) + self.sparse_pruning_method = None + self.row_pruning_method = None + self.head_pruning_method = None + self.activation_quantization_method = None + self.weight.start_bits = None + self.weight.target_bits = None + self.weight.q_period = None + self.weight_quantization_enabled_in_forward = False + self.weight_quantization_enabled = False + self.sparse_pruning_enabled = False + self.row_pruning_enabled = False + self.head_pruning_enabled = False + self.activation_quantization_enabled = False + + def extra_repr(self): + return 'in_features={}, out_features={}, bias={}, sparse pruning={}, row pruning={}, head pruning={}, activation quantization={}, weight_quantization={}'.format( + self.in_features, self.out_features, self.bias is not None, self.sparse_pruning_method is not None, \ + self.row_pruning_method is not None, self.head_pruning_method is not None, self.activation_quantization_method is not None, self.weight.target_bits) + + def enable_sparse_pruning(self, ratio, method): + # Here, we support two cases: L1 norm based pruning and topk based pruning + self.sparse_pruning_ratio = ratio + self.sparse_pruning_method = method + if method == 'l1': + weight_norm = torch.abs(self.weight.data) + mask = TopKBinarizer.apply(weight_norm, self.sparse_pruning_ratio, False) + mask = mask.view(self.weight.size()) + mask = mask.to(self.weight.device) + elif method == 'topk': + self.sparse_mask_scores = nn.Parameter(torch.Tensor(self.weight.size())) + self.sparse_mask_scores.data = self.sparse_mask_scores.data.to( + self.weight.device) + init.kaiming_uniform_(self.sparse_mask_scores, a=math.sqrt(5)) + mask = None + else: + raise NotImplementedError + + self.register_buffer('sparse_pruning_mask', mask) + + def enable_row_pruning(self, ratio, method): + # Here, we support two cases: L1 norm based pruning and topk based pruning + self.row_pruning_ratio = ratio + 
self.row_pruning_method = method + + if method == 'l1': + # compute the l1 norm of each column + weight_norm = torch.norm(self.weight.data, p=1, dim=1) + mask = TopKBinarizer.apply(weight_norm, self.row_pruning_ratio, False) + mask = mask.view(-1, 1) + mask = mask.to(self.weight.device) + elif method == 'topk': + self.row_mask_scores = nn.Parameter(torch.Tensor(self.weight.size(0), 1)) + self.row_mask_scores.data = self.row_mask_scores.data.to(self.weight.device) + init.kaiming_uniform_(self.row_mask_scores, a=math.sqrt(5)) + mask = None + else: + raise NotImplementedError + + self.register_buffer('row_pruning_mask', mask) + + def enable_head_pruning(self, ratio, method, num_heads): + # Here, we support only topk based pruning + self.num_heads = num_heads + self.head_pruning_ratio = ratio + self.head_pruning_method = method + + if method not in ['topk']: + raise NotImplementedError + else: + self.head_pruning_ratio = ratio + self.head_pruning_scores = nn.Parameter(torch.Tensor( + 1, + self.num_heads)) # we apply the pruning to O matrix + self.head_pruning_scores.data = self.head_pruning_scores.data.to( + self.weight.device) + init.kaiming_uniform_(self.head_pruning_scores, a=math.sqrt(5)) + + def fix_sparse_pruning_helper(self): + mask = self.get_mask(pruning_type='sparse') + self.weight.data = self.weight.data * mask + del self.sparse_pruning_mask + if self.sparse_pruning_method == 'topk': + del self.sparse_mask_scores + self.sparse_pruning_method = None + self.sparse_pruning_enabled = False + return None + + def fix_row_col_pruning_helper(self, mask=None, dim_reduction=False): + # This function is used for row/col pruning + # particularly, if we have two back-to-back layers, F1 and F2; when + # we remove rows from F1, we also need to remove columns from F2 + # However, if we only have one layer, F1, then we only need to mask pruned + # rows as 0 in F1 + if mask is None: + mask = self.get_mask(pruning_type='row').bool() + if dim_reduction: + start_bits = 
self.weight.start_bits + target_bits = self.weight.target_bits + q_period = self.weight.q_period + self.weight = nn.Parameter(self.weight.data[mask.view(-1), :]) + self.weight.start_bits = start_bits + self.weight.target_bits = target_bits + self.weight.q_period = q_period + if self.bias is not None: + self.bias = nn.Parameter(self.bias.data[mask.view(-1)]) + self.out_features = self.weight.size(0) + else: + self.weight.data = self.weight.data * mask.view(-1, 1) + if self.bias is not None: + self.bias.data = self.bias.data * mask.view(-1) + + del self.row_pruning_mask + if self.row_pruning_method == 'topk': + del self.row_mask_scores + self.row_pruning_method = None + else: + # this is generally for column pruning + start_bits = self.weight.start_bits + target_bits = self.weight.target_bits + q_period = self.weight.q_period + self.weight = nn.Parameter(self.weight.data[:, mask.view(-1)]) + self.weight.start_bits = start_bits + self.weight.target_bits = target_bits + self.weight.q_period = q_period + self.in_features = self.weight.size(1) + mask = None + self.row_pruning_enabled = False + return mask + + def fix_head_pruning_helper(self, mask=None, num_heads=None, dim_reduction=False): + # similar as row/col pruning, head pruning also needs to prune QKV which is associated with O matrix + num_heads = num_heads if num_heads else self.num_heads + if mask is None: + if self.head_pruning_method == 'topk': + mask = self.get_mask(pruning_type='head').bool() + if dim_reduction: + shape = self.weight.size(0) + start_bits = self.weight.start_bits + target_bits = self.weight.target_bits + q_period = self.weight.q_period + self.weight = nn.Parameter(self.weight.data.t().reshape(num_heads, -1)[mask.view(-1), :].reshape(-1, shape).t()) + self.weight.start_bits = start_bits + self.weight.target_bits = target_bits + self.weight.q_period = q_period + else: + + shape = self.weight.size() + self.weight.data = (self.weight.data.t().reshape(self.num_heads, + -1) * + mask.view(-1, + 
1)).reshape(shape[1], + shape[0]).t() + + if self.head_pruning_method == 'topk': + del self.head_pruning_scores + self.head_pruning_method = None + else: + raise NotImplementedError + else: + start_bits = self.weight.start_bits + target_bits = self.weight.target_bits + q_period = self.weight.q_period + shape = self.weight.size(1) + self.weight = nn.Parameter(self.weight.data.reshape(num_heads, -1)[mask.view(-1), :].reshape(-1, shape)) + self.weight.start_bits = start_bits + self.weight.target_bits = target_bits + self.weight.q_period = q_period + if self.bias is not None: + self.bias = nn.Parameter(self.bias.data.reshape(num_heads, -1)[mask.view(-1), :].reshape(-1)) + self.head_pruning_enabled = False + return mask + + def get_mask(self, pruning_type='row'): + if pruning_type == 'sparse': + if self.sparse_pruning_method == 'l1': + return self.sparse_pruning_mask.to(self.weight.device) + elif self.sparse_pruning_method == 'topk': + return TopKBinarizer.apply(self.sparse_mask_scores, + self.sparse_pruning_ratio, + False) + else: + raise NotImplementedError + if pruning_type == 'row': + if self.row_pruning_method == 'l1': + return self.row_pruning_mask.to(self.weight.device) + elif self.row_pruning_method == 'topk': + return TopKBinarizer.apply(self.row_mask_scores, + self.row_pruning_ratio, + False) + else: + raise NotImplementedError + elif pruning_type == 'head': + if self.head_pruning_method == 'topk': + return TopKBinarizer.apply(self.head_pruning_scores, + self.head_pruning_ratio, + False) + else: + raise NotImplementedError + else: + raise NotImplementedError + + def enable_weight_quantization(self, + start_bits, + target_bits, + quantization_period, + weight_quantization_enabled_in_forward, + quantization_type, + num_groups): + self.weight.start_bits = start_bits + self.weight.target_bits = target_bits + self.weight.q_period = quantization_period + self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward + if 
self.weight_quantization_enabled_in_forward: + logger.warning( + "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider to use DS-FP16 optimizer************" + ) + if self.weight.target_bits >= 3: + if quantization_type == 'symmetric': + self.weight_quantizer = SymQuantizer.apply + else: + self.weight_quantizer = AsymQuantizer.apply + elif self.weight.target_bits == 2: + assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for ternary weight quantization' + self.weight_quantizer = TernaryQuantizer.apply + elif self.weight.target_bits == 1: + assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for binary weight quantization' + self.weight_quantizer = BinaryQuantizer.apply + self.weight_quantize_num_groups = num_groups + + def fix_weight_quantization(self): + self.weight.data = self.weight_quantizer(self.weight, + self.weight.target_bits, + None, + None, + self.weight_quantize_num_groups).data + self.weight_quantization_enabled_in_forward = False + return None + + def enable_activation_quantization(self, bits, quantization_type, range_calibration): + assert bits in [4, 8], 'Only 4/8 bits activation quantization are supported for now' + self.activation_quantization_bits = bits + self.activation_quantization_method = f"{quantization_type}_{range_calibration}" + if range_calibration == 'static': + self.activation_quantizer = QuantAct(quant_mode=quantization_type) + else: + if quantization_type == 'symmetric': + self.activation_quantizer = SymQuantizer.apply + else: + self.activation_quantizer = AsymQuantizer.apply + + def head_pruning_reshape(self, w, mask): + shape = w.shape + return (w.t().reshape(self.num_heads, + -1) * mask.view(-1, + 1)).reshape(shape[1], + shape[0]).t() + + def forward(self, input, skip_bias_add=False): + + if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled: + weight = self.weight_quantizer(self.weight, + 
self.weight.target_bits, + None, + None, + self.weight_quantize_num_groups) + bias = self.bias + else: + weight = self.weight + bias = self.bias + + if self.sparse_pruning_enabled and self.sparse_pruning_method: + mask = self.get_mask(pruning_type='sparse') + weight = weight * mask.view(self.weight.size()) + + if self.row_pruning_enabled and self.row_pruning_method: + mask = self.get_mask(pruning_type='row') + weight = weight * mask.view(-1, 1) + if bias is not None: + bias = bias * mask.view(-1) + + if self.head_pruning_enabled and self.head_pruning_method: + mask = self.get_mask(pruning_type='head') + weight = self.head_pruning_reshape(weight, mask) + + if self.activation_quantization_enabled: + if 'dynamic' in self.activation_quantization_method: + num_groups = input.numel() // input.size(-1) + else: + num_groups = 1 + input = self.activation_quantizer(input, + self.activation_quantization_bits, + None, + None, + num_groups) + + if skip_bias_add: + # used for mpu linear layers + output = nn.functional.linear(input, weight, None) + return output, bias + else: + output = nn.functional.linear(input, weight, bias) + return output + + +class Conv2dLayer_Compress(nn.Conv2d): + """ + Conv2D layer with compression. 
+ """ + def __init__(self, *kargs): + super(Conv2dLayer_Compress, self).__init__(*kargs) + self.sparse_pruning_method = None + self.channel_pruning_method = None + self.activation_quantization_method = None + self.weight.start_bits = None + self.weight.target_bits = None + self.weight.q_period = None + self.weight_quantization_enabled_in_forward = False + self.sparse_pruning_enabled = False + self.channel_pruning_enabled = False + self.activation_quantization_enabled = False + + def __repr__(self): + s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}' + ', stride={stride}') + if self.padding != (0, ) * len(self.padding): + s += ', padding={padding}' + if self.dilation != (1, ) * len(self.dilation): + s += ', dilation={dilation}' + if self.output_padding != (0, ) * len(self.output_padding): + s += ', output_padding={output_padding}' + if self.groups != 1: + s += ', groups={groups}' + if self.bias is None: + s += ', bias=False' + if self.padding_mode != 'zeros': + s += ', padding_mode={padding_mode}' + output = s.format(**self.__dict__) + + return output + ' sparse pruning={}, channel pruning={}, activation quantization={}, weight_quantization={}'.format( + self.sparse_pruning_method is not None, + self.channel_pruning_method is not None, + self.activation_quantization_method is not None, + self.weight.target_bits) + + def enable_sparse_pruning(self, ratio, method): + self.sparse_pruning_ratio = ratio + self.sparse_pruning_method = method + if method == 'l1': + weight_norm = torch.abs(self.weight.data) + mask = TopKBinarizer.apply(weight_norm, self.sparse_pruning_ratio, False) + mask = mask.view(self.weight.size()) + mask = mask.to(self.weight.device) + elif method == 'topk': + self.sparse_mask_scores = nn.Parameter(torch.Tensor(self.weight.size())) + self.sparse_mask_scores.data = self.sparse_mask_scores.data.to( + self.weight.device) + init.kaiming_uniform_(self.sparse_mask_scores, a=math.sqrt(5)) + mask = None + else: + raise NotImplementedError + + 
self.register_buffer('sparse_pruning_mask', mask) + + def enable_channel_pruning(self, ratio, method): + # Here, we support two cases: L1 norm based pruning and topk based pruning + self.channel_pruning_ratio = ratio + self.channel_pruning_method = method + + if method == 'l1': + # compute the l1 norm of each conv2d kernel (the last three dimension) + weight_norm = torch.norm(self.weight.data, p=1, dim=[1, 2, 3]) + mask = TopKBinarizer.apply(weight_norm, self.channel_pruning_ratio, False) + mask = mask.view(-1, 1, 1, 1) + mask = mask.to(self.weight.device) + elif method == 'topk': + self.channel_mask_scores = nn.Parameter( + torch.Tensor(self.weight.size(0), + 1, + 1, + 1)) + self.channel_mask_scores.data = self.channel_mask_scores.data.to( + self.weight.device) + init.kaiming_uniform_(self.channel_mask_scores, a=math.sqrt(5)) + mask = None + else: + raise NotImplementedError + + self.register_buffer('channel_pruning_mask', mask) + + def fix_sparse_pruning_helper(self): + mask = self.get_mask(pruning_type='sparse') + self.weight.data = self.weight.data * mask + del self.sparse_pruning_mask + if self.sparse_pruning_method == 'topk': + del self.sparse_mask_scores + self.sparse_pruning_method = None + self.sparse_pruning_enabled = False + return None + + def fix_channel_pruning_helper(self, mask=None, dim_reduction=False): + if mask is None: + if self.channel_pruning_method in ['l1', 'topk']: + mask = self.get_mask(pruning_type='channel').bool() + if dim_reduction: + start_bits = self.weight.start_bits + target_bits = self.weight.target_bits + q_period = self.weight.q_period + self.weight = nn.Parameter(self.weight.data[mask.view(-1), ...]) + self.weight.start_bits = start_bits + self.weight.target_bits = target_bits + self.weight.q_period = q_period + if self.bias is not None: + self.bias = nn.Parameter(self.bias.data[mask.view(-1)]) + else: + self.weight.data = self.weight.data * mask.view(-1, 1, 1, 1) + if self.bias is not None: + self.bias.data = self.bias.data * 
mask.view(-1) + del self.channel_pruning_mask + if self.channel_pruning_method == 'topk': + del self.channel_mask_scores + self.channel_pruning_method = None + else: + raise NotImplementedError + else: + start_bits = self.weight.start_bits + target_bits = self.weight.target_bits + q_period = self.weight.q_period + self.weight = nn.Parameter(self.weight.data[:, mask.view(-1), ...]) + self.weight.start_bits = start_bits + self.weight.target_bits = target_bits + self.weight.q_period = q_period + mask = None + self.channel_pruning_enabled = False + return mask + + def get_mask(self, pruning_type='sparse'): + if pruning_type == 'sparse': + if self.sparse_pruning_method == 'l1': + return self.sparse_pruning_mask.to(self.weight.device) + elif self.sparse_pruning_method == 'topk': + return TopKBinarizer.apply(self.sparse_mask_scores, + self.sparse_pruning_ratio, + False) + else: + raise NotImplementedError + elif pruning_type == 'channel': + if self.channel_pruning_method == 'l1': + return self.channel_pruning_mask.to(self.weight.device) + elif self.channel_pruning_method == 'topk': + return TopKBinarizer.apply(self.channel_mask_scores, + self.channel_pruning_ratio, + False) + else: + raise NotImplementedError + else: + raise NotImplementedError + + def fix_weight_quantization(self): + self.weight.data = self.weight_quantizer(self.weight, + self.weight.target_bits, + None, + None, + self.weight_quantize_num_groups).data + self.weight_quantization_enabled_in_forward = False + return None + + def enable_weight_quantization(self, + start_bits, + target_bits, + quantization_period, + weight_quantization_enabled_in_forward, + quantization_type, + num_groups): + self.weight.start_bits = start_bits + self.weight.target_bits = target_bits + self.weight.q_period = quantization_period + self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward + if self.weight_quantization_enabled_in_forward: + assert self.weight.target_bits >= 4, 'Only >=4 bits weight 
quantization are supported during forward pass for now' + logger.warning( + "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider to use DS-FP16 optimizer************" + ) + if quantization_type == 'symmetric': + self.weight_quantizer = SymQuantizer.apply + else: + self.weight_quantizer = AsymQuantizer.apply + self.weight_quantize_num_groups = num_groups + + def enable_activation_quantization(self, bits, quantization_type, range_calibration): + assert bits in [4, 8], 'Only 4/8 bits activation quantization are supported for now' + self.activation_quantization_bits = bits + self.activation_quantization_method = f"{quantization_type}_{range_calibration}" + if range_calibration == 'static': + self.activation_quantizer = QuantAct(quant_mode=quantization_type) + else: + if quantization_type == 'symmetric': + self.activation_quantizer = SymQuantizer.apply + else: + self.activation_quantizer = AsymQuantizer.apply + + def forward(self, input): + + if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled: + weight = self.weight_quantizer(self.weight, + self.weight.target_bits, + None, + None, + self.weight_quantize_num_groups) + bias = self.bias + else: + weight = self.weight + bias = self.bias + + if self.sparse_pruning_enabled and self.sparse_pruning_method: + mask = self.get_mask(pruning_type='sparse') + weight = weight * mask.view(self.weight.size()) + + if self.channel_pruning_enabled: + mask = self.get_mask(pruning_type='channel') + weight = weight * mask.view(-1, 1, 1, 1) + if bias is not None: + bias = bias * mask.view(-1) + + if self.activation_quantization_enabled: + if 'dynamic' in self.activation_quantization_method: + num_groups = input.numel() // input[0].numel() + else: + num_groups = 1 + input = self.activation_quantizer(input, + self.activation_quantization_bits, + None, + None, + num_groups) + + return nn.functional.conv2d(input, + weight, + bias, + self.stride, + self.padding, + 
self.dilation, + self.groups) + + +class BNLayer_Compress(nn.BatchNorm2d): + def fix_channel_pruning_helper(self, mask, dim_reduction=True): + self.weight = nn.Parameter(self.weight.data[mask.view(-1)]) + self.bias = nn.Parameter(self.bias.data[mask.view(-1)]) + self.running_mean = self.running_mean[mask.view(-1)] + self.running_var = self.running_var[mask.view(-1)] + + +def _reduce(input_): + """All-reduce the the input tensor across model parallel group.""" + group = g_mpu.get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if dist.get_world_size(group=group) == 1: + return input_ + + # All-reduce. + dist.all_reduce(input_, group=group) + + return input_ + + +def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False): + """Split a tensor along its last dimension. + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + assert tensor.size()[last_dim] % num_partitions == 0 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +def _split(input_): + """Split the tensor along its last dimension and keep the + corresponding slice.""" + group = g_mpu.get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if dist.get_world_size(group=group) == 1: + return input_ + + # Split along last dimension. + world_size = dist.get_world_size(group=group) + input_list = split_tensor_along_last_dim(input_, world_size) + + # Note: torch.split does not create contiguous tensors by default. 
+ rank = dist.get_rank(group=group) + output = input_list[rank].contiguous() + + return output + + +def _gather(input_): + """Gather tensors and concatinate along the last dimension.""" + group = g_mpu.get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if dist.get_world_size(group=group) == 1: + return input_ + + # Size and dimension. + last_dim = input_.dim() - 1 + rank = dist.get_rank(group=group) + world_size = dist.get_world_size(group=group) + + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + dist.all_gather(tensor_list, input_, group=group) + + # Note: torch.cat already creates a contiguous tensor. + output = torch.cat(tensor_list, dim=last_dim).contiguous() + + return output + + +class _CopyToModelParallelRegion(torch.autograd.Function): + """Pass the input to the model parallel region.""" + @staticmethod + def forward(ctx, input_): + return input_ + + @staticmethod + def backward(ctx, grad_output): + return _reduce(grad_output) + + +class _ReduceFromModelParallelRegion(torch.autograd.Function): + """All-redcue the input from the model parallel region.""" + @staticmethod + def forward(ctx, input_): + return _reduce(input_) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +class _ScatterToModelParallelRegion(torch.autograd.Function): + """Split the input and keep only the corresponding chuck to the rank.""" + @staticmethod + def forward(ctx, input_): + return _split(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather(grad_output) + + +class _GatherFromModelParallelRegion(torch.autograd.Function): + """Gather the input from model parallel region and concatinate.""" + @staticmethod + def forward(ctx, input_): + return _gather(input_) + + @staticmethod + def backward(ctx, grad_output): + return _split(grad_output) + + +# ----------------- +# Helper functions. 
+# ----------------- + + +def copy_to_model_parallel_region(input_): + return _CopyToModelParallelRegion.apply(input_) + + +def reduce_from_model_parallel_region(input_): + return _ReduceFromModelParallelRegion.apply(input_) + + +def scatter_to_model_parallel_region(input_): + return _ScatterToModelParallelRegion.apply(input_) + + +def gather_from_model_parallel_region(input_): + return _GatherFromModelParallelRegion.apply(input_) + + +class ColumnParallelLinear_Compress(LinearLayer_Compress): + def __init__(self, + mpu, + input_size, + output_size, + bias=True, + gather_output=True, + skip_bias_add=False): + # Keep input parameters + global g_mpu + g_mpu = mpu + self.input_size = input_size + self.output_size = output_size + self.gather_output = gather_output + self.skip_bias_add = skip_bias_add + + # Divide the weight matrix along the last dimension. + world_size = mpu.get_model_parallel_world_size() + assert output_size % world_size == 0 + self.output_size_per_partition = output_size // world_size + + super(ColumnParallelLinear_Compress, + self).__init__(self.input_size, + self.output_size_per_partition, + bias=bias) + + def forward(self, input_): + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) + # Matrix multiply. + if self.skip_bias_add: + output_parallel, bias = super().forward(input_parallel, True) + else: + output_parallel = super().forward(input_parallel) + bias = None + if self.gather_output: + # All-gather across the partitions. 
+ output = gather_from_model_parallel_region(output_parallel) + else: + output = output_parallel + return output, bias + + +class RowParallelLinear_Compress(LinearLayer_Compress): + def __init__(self, + mpu, + input_size, + output_size, + bias=True, + input_is_parallel=False, + skip_bias_add=False): + # Keep input parameters + global g_mpu + g_mpu = mpu + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + self.skip_bias_add = skip_bias_add + + # Divide the weight matrix along the last dimension. + world_size = mpu.get_model_parallel_world_size() + assert input_size % world_size == 0 + self.input_size_per_partition = input_size // world_size + + super(RowParallelLinear_Compress, + self).__init__(self.input_size_per_partition, + self.output_size, + bias=bias) + + def forward(self, input_): + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + input_parallel = scatter_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel, bias = super().forward(input_parallel, True) + + # All-reduce across all the partitions. 
+ output_ = reduce_from_model_parallel_region(output_parallel) + if not self.skip_bias_add: + if bias is not None: + output = output_ + bias + else: + output = output_ + output_bias = None + else: + output = output_ + output_bias = bias + return output, output_bias diff --git a/deepspeed/compression/compress.py b/deepspeed/compression/compress.py new file mode 100644 index 0000000000000000000000000000000000000000..bf3b6c2760fa5b31b737ef62eea162e5418d4d74 --- /dev/null +++ b/deepspeed/compression/compress.py @@ -0,0 +1,233 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import re +from .helper import compression_preparation, fix_compression, recursive_getattr, is_module_compressible +from .config import get_compression_config +from ..runtime.config_utils import dict_raise_error_on_duplicate_keys +from .constants import * +import os +import json + + +def check_deepspeed_config(config): + if isinstance(config, dict): + return config + elif os.path.exists(config): + return json.load(open(config, + "r"), + object_pairs_hook=dict_raise_error_on_duplicate_keys) + else: + raise ValueError( + f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {config}" + ) + + +def get_module_name(group_name, + model, + key_word, + exist_module_name, + mpu=None, + verbose=True): + ''' + get the associated module name from the model based on the key_word provided by users + ''' + return_module_name = [] + for name, module in model.named_modules(): + + module_check = is_module_compressible(module, mpu) + + if re.search(key_word, name) is not None and module_check: + if name in exist_module_name and verbose: + # logger.warning + raise ValueError( + f"{name} is already added to compression, please check your config file for {group_name}." 
+ ) + if name not in exist_module_name: + exist_module_name.add(name) + return_module_name.append(name) + return return_module_name, exist_module_name + + +def get_compress_methods(model, compress_methods, mpu=None): + # extract the compression module for each method in compress_methods + layer_added_compress_methods = [] + for method, method_content in compress_methods.items(): + if LAYER_REDUCTION in method: + continue + # for loop different methods, i.e., weight quantization, activation quantization etc + exist_module_name = set() + shared_parameters = method_content[ + SHARED_PARAMETERS] # get all the shared parameters + for group_name, method_parameters in method_content[DIFFERENT_GROUPS].items(): + # for loop different groups, i.e., weight quantization group 1, weight quantization group 2 etc + module_name_list = [] + related_module_name_list = [] + if method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]: + # this is used for head/row/channel pruning, if users provide the related module scope, we can shrink the layer dim for them + # otherwise we just mask those as zeros + for key_word, related_key_words in zip(method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE], method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]): + module_name, exist_module_name = get_module_name(group_name, model, key_word, exist_module_name, mpu=mpu) + module_name_list.append(module_name) + tmp_related_module_name_list = [] + for rkw in related_key_words: + # related key word can be a list, for instance the QKV for O matrix in Attention + module_name, _ = get_module_name(group_name, model, rkw, set(), mpu=mpu) + tmp_related_module_name_list.append(module_name) + related_module_name_list.append(tmp_related_module_name_list) + else: + for key_word in method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE]: + module_name, exist_module_name = get_module_name(group_name, model, key_word, exist_module_name, mpu=mpu) + module_name_list.append(module_name) + + if module_name_list: + # combine 
shared parameters with each group + combined_method_parameters = { + **(method_parameters.copy().pop(DIFFERENT_GROUPS_PARAMETERS)), + **shared_parameters + } + compression_item = [ + module_name_list, + related_module_name_list, + { + method: combined_method_parameters + } + ] + layer_added_compress_methods.append(compression_item) + return layer_added_compress_methods + + +def init_compression(model, deepspeed_config, teacher_model=None, mpu=None): + """ + Compress a model: replace linear/conv2d layer with deepspeed compression-aware modules + Args: + model (`torch.nn.Module`) + The model to compress. + deepspeed_config (`DeepSpeedConfig`) + The path of ds_config + mpu + The mpu module for Row/Column parallelism + """ + compress_methods = get_compression_config(check_deepspeed_config(deepspeed_config)) + if hasattr(model, 'module'): + c_model = model.module + else: + c_model = model + + # For layer reduction + if compress_methods[LAYER_REDUCTION][LAYER_REDUCTION_ENABLED]: + assert teacher_model is not None, "Teacher model is required for layer reduction" + student_initialization(c_model, teacher_model, deepspeed_config) + + layer_added_compress_methods = get_compress_methods(c_model, + compress_methods, + mpu=mpu) + compression_preparation(c_model, layer_added_compress_methods, mpu) + + return model + + +def redundancy_clean(model, deepspeed_config, mpu=None): + """ + Remove the redundancy of a model + Args: + model (`torch.nn.Module`) + The model to compress. 
+ deepspeed_config (`DeepSpeedConfig`) + The path of ds_config + mpu + The mpu module for Row/Column parallelism + """ + compress_methods = get_compression_config(check_deepspeed_config(deepspeed_config)) + if hasattr(model, 'module'): + c_model = model.module + else: + c_model = model + + layer_added_compress_methods_tmp = get_compress_methods(c_model, + compress_methods, + mpu=mpu) + # sort methods + order_list = [ + WEIGHT_QUANTIZATION, + SPARSE_PRUNING, + ROW_PRUNING, + HEAD_PRUNING, + CHANNEL_PRUNING, + ACTIVATION_QUANTIZATION + ] + layer_added_compress_methods = sorted( + layer_added_compress_methods_tmp, + key=lambda x: order_list.index(list(x[2].keys())[0])) + + for module_name_lists, related_module_name_lists, compression_technique in layer_added_compress_methods: + stored_mask = [] + need_mask = True if related_module_name_lists else False + for i, mnl in enumerate(module_name_lists): + for module_name in mnl: + mask = fix_compression(c_model, + module_name, + compression_technique, + dim_reduction=need_mask) + if need_mask: + stored_mask.append(mask) + if need_mask: + for rmnl in related_module_name_lists[i]: + for j, module_name in enumerate(rmnl): + mask = fix_compression(c_model, + module_name, + compression_technique, + mask=stored_mask[j], + dim_reduction=True) + return model + + +def student_initialization(student_model, teacher_model, deepspeed_config): + ''' + Given a student model and a teacher model, select the + Args: + student_model (`torch.nn.Module`) + The model we will update weight + teacher_model (`torch.nn.Module`) + The model guide the student to learn + deepspeed_config (`DeepSpeedConfig`) + The path of ds_config + ''' + config = get_compression_config(check_deepspeed_config(deepspeed_config)) + compress_methods = config[LAYER_REDUCTION] + + module_name_prefix = compress_methods[MODULE_NAME_PREFIX] + teacher_layer = compress_methods[TEACHER_LAYER] + student_layer = [i for i in range(len(teacher_layer))] + other_module_name = 
compress_methods[OTHER_MODULE_NAME] + ''' + name_prefix (`str`) + The prefix name before the layer #. + Example 1: bert.encoder.layer, for BERT_base model's prefix name + Example 2: transformer.h, for GPT-2 hugging face prefix name + teacher_layer (`list of intergers`) + The layer of teacher will be used for student's reinitializedion + Example 1: [1,3,5,7,9], means we want to matches the 2nd/4th/6th/8th/10th layer of teacher to the first 5 layers of student + student_layer (`list` or None) + The layer of student need to be re-intiialized + Example 1: None, means we want to reinitialize all the layers + Example 1: [0,1,2,3,4], means we want to reinitialize the first 5 layers + other_module_name (`list of string`) + The modules will be used for student's reinitializedion + Example 1: ['bert.pooler', 'bert.embeddings', 'classifier'], means we want to apply the weight in teacher's embedding/pooler/classier module to the student + Example 2: ['transformer.w', 'transformer.ln_f', 'lm_head'], means we want to apply the weight in teacher's embeddingn layers module to the student + Note that teacher_layer should matches student layer + ''' + assert len(student_layer) == len(teacher_layer) + for s_name, t_name in zip(student_layer, teacher_layer): + s_module = recursive_getattr(student_model, + module_name_prefix + '.' + str(s_name)) + t_module = recursive_getattr(teacher_model, + module_name_prefix + '.' 
+ str(t_name)) + for s_param, t_param in zip(s_module.parameters(), t_module.parameters()): + s_param.data.copy_(t_param.data) + for name in other_module_name: + s_module = recursive_getattr(student_model, name) + t_module = recursive_getattr(teacher_model, name) + print(name) + for s_param, t_param in zip(s_module.parameters(), t_module.parameters()): + s_param.data.copy_(t_param.data) diff --git a/deepspeed/compression/config.py b/deepspeed/compression/config.py new file mode 100644 index 0000000000000000000000000000000000000000..e6a710dfa3ea5d6d0ea56b372233fb749f7ca1e8 --- /dev/null +++ b/deepspeed/compression/config.py @@ -0,0 +1,492 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .constants import * +import copy +from ..runtime.config_utils import get_scalar_param + + +def get_compression_config(param_dict): + # + output = {} + + if COMPRESSION_TRAINING not in param_dict.keys(): + param_dict[COMPRESSION_TRAINING] = {} + sub_param_dict = param_dict[COMPRESSION_TRAINING] + output[WEIGHT_QUANTIZATION] = get_weight_quantization(sub_param_dict) + output[ACTIVATION_QUANTIZATION] = get_activation_quantization(sub_param_dict) + output[SPARSE_PRUNING] = get_sparse_pruning(sub_param_dict) + output[ROW_PRUNING] = get_row_pruning(sub_param_dict) + output[HEAD_PRUNING] = get_head_pruning(sub_param_dict) + output[CHANNEL_PRUNING] = get_channel_pruning(sub_param_dict) + + output[LAYER_REDUCTION] = get_layer_reduction(sub_param_dict) + + return output + + +def get_layer_reduction(param_dict): + output = {} + output[LAYER_REDUCTION_ENABLED] = LAYER_REDUCTION_ENABLED_DEFAULT + if get_layer_reduction_enabled(param_dict): + output[LAYER_REDUCTION_ENABLED] = get_layer_reduction_enabled(param_dict) + for key, val in get_layer_reduction_params(param_dict).items(): + output[key] = val + return output + + +def get_layer_reduction_enabled(param_dict): + if LAYER_REDUCTION in param_dict.keys(): + return get_scalar_param(param_dict[LAYER_REDUCTION], + LAYER_REDUCTION_ENABLED, + 
LAYER_REDUCTION_ENABLED_DEFAULT) + else: + return False + + +def get_layer_reduction_params(param_dict): + if LAYER_REDUCTION in param_dict.keys(): + layer_reduction_params = copy.copy(param_dict[LAYER_REDUCTION]) + layer_reduction_params.pop(LAYER_REDUCTION_ENABLED) + return layer_reduction_params + else: + return False + + +def get_quantize_enabled(param_dict): + if COMPRESSION_TRAINING not in param_dict.keys(): + return False + + sub_param_dict = param_dict[COMPRESSION_TRAINING] + output = get_weight_quantization_shared_parameters(sub_param_dict) + return output[WEIGHT_QUANTIZE_ENABLED] + + +def get_weight_quantization(param_dict): + output = {} + if WEIGHT_QUANTIZATION not in param_dict.keys(): + param_dict[WEIGHT_QUANTIZATION] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} + sub_param_dict = param_dict[WEIGHT_QUANTIZATION] + # shared parameters + output[SHARED_PARAMETERS] = get_weight_quantization_shared_parameters(sub_param_dict) + # each sub-groups + if output[SHARED_PARAMETERS][WEIGHT_QUANTIZE_ENABLED]: + assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Weigh Quantization is enabled, {DIFFERENT_GROUPS} must be specified" + output[DIFFERENT_GROUPS] = get_weight_quantization_different_groups(sub_param_dict) + return output + + +def get_weight_quantization_shared_parameters(param_dict): + output = {} + if SHARED_PARAMETERS in param_dict.keys(): + sub_param_dict = param_dict[SHARED_PARAMETERS] + output[WEIGHT_QUANTIZE_ENABLED] = get_scalar_param( + sub_param_dict, + WEIGHT_QUANTIZE_ENABLED, + WEIGHT_QUANTIZE_ENABLED_DEFAULT) + output[WEIGHT_QUANTIZE_KERNEL] = get_scalar_param( + sub_param_dict, + WEIGHT_QUANTIZE_KERNEL, + WEIGHT_QUANTIZE_KERNEL_DEFAULT) + output[WEIGHT_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param( + sub_param_dict, + WEIGHT_QUANTIZE_SCHEDULE_OFFSET, + WEIGHT_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) + output[WEIGHT_QUANTIZE_GROUPS] = get_scalar_param( + sub_param_dict, + WEIGHT_QUANTIZE_GROUPS, + WEIGHT_QUANTIZE_GROUPS_DEFAULT) + 
output[WEIGHT_QUANTIZE_VERBOSE] = get_scalar_param( + sub_param_dict, + WEIGHT_QUANTIZE_VERBOSE, + WEIGHT_QUANTIZE_VERBOSE_DEFAULT) + output[WEIGHT_QUANTIZE_TYPE] = get_scalar_param(sub_param_dict, + WEIGHT_QUANTIZE_TYPE, + WEIGHT_QUANTIZE_TYPE_DEFAULT) + output[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] = get_scalar_param( + sub_param_dict, + WEIGHT_QUANTIZE_IN_FORWARD_ENABLED, + WEIGHT_QUANTIZE_IN_FORWARD_ENABLED_DEFAULT) + assert output[WEIGHT_QUANTIZE_TYPE] in [WEIGHT_QUANTIZE_SYMMETRIC, WEIGHT_QUANTIZE_ASYMMETRIC], f"Invalid weight quantize type. Supported types: [{WEIGHT_QUANTIZE_SYMMETRIC}, {WEIGHT_QUANTIZE_ASYMMETRIC}]" + output[WEIGHT_QUANTIZE_ROUNDING] = get_scalar_param( + sub_param_dict, + WEIGHT_QUANTIZE_ROUNDING, + WEIGHT_QUANTIZE_ROUNDING_DEFAULT) + assert output[WEIGHT_QUANTIZE_ROUNDING] in [WEIGHT_QUANTIZE_NEAREST_ROUNDING, WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING], f"Invalid weight quantize rounding. Supported types: [{WEIGHT_QUANTIZE_NEAREST_ROUNDING}, {WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING}]" + if WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE in sub_param_dict.keys(): + output[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = get_scalar_param( + sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], + WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED, + WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT) + output[WEIGHT_QUANTIZE_CHANGE_RATIO] = get_scalar_param( + sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], + WEIGHT_QUANTIZE_CHANGE_RATIO, + WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT) + else: + output[ + WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT + output[WEIGHT_QUANTIZE_CHANGE_RATIO] = WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT + else: + output[WEIGHT_QUANTIZE_ENABLED] = WEIGHT_QUANTIZE_ENABLED_DEFAULT + output[WEIGHT_QUANTIZE_KERNEL] = WEIGHT_QUANTIZE_KERNEL_DEFAULT + output[WEIGHT_QUANTIZE_SCHEDULE_OFFSET] = WEIGHT_QUANTIZE_SCHEDULE_OFFSET_DEFAULT + output[WEIGHT_QUANTIZE_GROUPS] = WEIGHT_QUANTIZE_GROUPS_DEFAULT + 
output[WEIGHT_QUANTIZE_VERBOSE] = WEIGHT_QUANTIZE_VERBOSE_DEFAULT + output[WEIGHT_QUANTIZE_TYPE] = WEIGHT_QUANTIZE_TYPE_DEFAULT + output[WEIGHT_QUANTIZE_ROUNDING] = WEIGHT_QUANTIZE_ROUNDING_DEFAULT + output[ + WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT + output[WEIGHT_QUANTIZE_CHANGE_RATIO] = WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT + return output + + +def get_weight_quantization_different_groups(param_dict): + output = {} + sub_param_dict = param_dict[DIFFERENT_GROUPS] + + def get_params(name, group_dict): + assert WEIGHT_QUANTIZE_START_BITS in group_dict.keys(), f"{WEIGHT_QUANTIZE_START_BITS} must be specified for weight quantization group {name}" + assert WEIGHT_QUANTIZE_TARGET_BITS in group_dict.keys(), f"{WEIGHT_QUANTIZE_TARGET_BITS} must be specified for weight quantization group {name}" + group_dict[WEIGHT_QUANTIZATION_PERIOD] = get_scalar_param( + group_dict, + WEIGHT_QUANTIZATION_PERIOD, + WEIGHT_QUANTIZATION_PERIOD_DEFAULT) + return group_dict + + for k, v in sub_param_dict.items(): + output[k] = {} + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( + k, + sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + + return output + + +def get_activation_quantization(param_dict): + output = {} + if ACTIVATION_QUANTIZATION not in param_dict.keys(): + param_dict[ACTIVATION_QUANTIZATION] = { + SHARED_PARAMETERS: {}, + DIFFERENT_GROUPS: {} + } + sub_param_dict = param_dict[ACTIVATION_QUANTIZATION] + # shared parameters + output[SHARED_PARAMETERS] = get_activation_quantization_shared_parameters( + sub_param_dict) + # each sub-groups + if 
output[SHARED_PARAMETERS][ACTIVATION_QUANTIZATION_ENABLED]: + assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Activation Quantization is enabled, {DIFFERENT_GROUPS} must be specified" + output[DIFFERENT_GROUPS] = get_activation_quantization_different_groups( + sub_param_dict) + return output + + +def get_activation_quantization_shared_parameters(param_dict): + output = {} + if SHARED_PARAMETERS in param_dict.keys(): + sub_param_dict = param_dict[SHARED_PARAMETERS] + output[ACTIVATION_QUANTIZATION_ENABLED] = get_scalar_param( + sub_param_dict, + ACTIVATION_QUANTIZATION_ENABLED, + ACTIVATION_QUANTIZATION_ENABLED_DEFAULT) + output[ACTIVATION_QUANTIZE_TYPE] = get_scalar_param( + sub_param_dict, + ACTIVATION_QUANTIZE_TYPE, + ACTIVATION_QUANTIZE_TYPE_DEFAULT) + assert output[ACTIVATION_QUANTIZE_TYPE] in [ACTIVATION_QUANTIZE_SYMMETRIC, ACTIVATION_QUANTIZE_ASYMMETRIC], f"Invalid activation quantize type. Supported types: [{ACTIVATION_QUANTIZE_SYMMETRIC}, {ACTIVATION_QUANTIZE_ASYMMETRIC}]" + output[ACTIVATION_QUANTIZE_RANGE] = get_scalar_param( + sub_param_dict, + ACTIVATION_QUANTIZE_RANGE, + ACTIVATION_QUANTIZE_RANGE_DEFAULT) + assert output[ACTIVATION_QUANTIZE_RANGE] in [ACTIVATION_QUANTIZE_RANGE_DYNAMIC, ACTIVATION_QUANTIZE_RANGE_STATIC], f"Invalid activation quantize range calibration. 
Supported types: [{ACTIVATION_QUANTIZE_RANGE_DYNAMIC}, {ACTIVATION_QUANTIZE_RANGE_STATIC}]" + output[ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param( + sub_param_dict, + ACTIVATION_QUANTIZE_SCHEDULE_OFFSET, + ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) + else: + output[ACTIVATION_QUANTIZATION_ENABLED] = ACTIVATION_QUANTIZATION_ENABLED_DEFAULT + output[ACTIVATION_QUANTIZE_TYPE] = ACTIVATION_QUANTIZE_TYPE_DEFAULT + output[ACTIVATION_QUANTIZE_RANGE] = ACTIVATION_QUANTIZE_RANGE_DEFAULT + output[ + ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT + return output + + +def get_activation_quantization_different_groups(param_dict): + output = {} + sub_param_dict = param_dict[DIFFERENT_GROUPS] + + def get_params(name, group_dict): + assert ACTIVATION_QUANTIZE_BITS in group_dict.keys(), f"{ACTIVATION_QUANTIZE_BITS} must be specified for activation quantization group {name}" + return group_dict + + for k, v in sub_param_dict.items(): + output[k] = {} + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( + k, + sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + + return output + + +def get_sparse_pruning(param_dict): + output = {} + if SPARSE_PRUNING not in param_dict.keys(): + param_dict[SPARSE_PRUNING] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} + sub_param_dict = param_dict[SPARSE_PRUNING] + # shared parameters + output[SHARED_PARAMETERS] = get_sparse_pruning_shared_parameters(sub_param_dict) + # each sub-groups + if output[SHARED_PARAMETERS][SPARSE_PRUNING_ENABLED]: + assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + 
output[DIFFERENT_GROUPS] = get_sparse_pruning_different_groups(sub_param_dict) + return output + + +def get_sparse_pruning_shared_parameters(param_dict): + output = {} + if SHARED_PARAMETERS in param_dict.keys(): + sub_param_dict = param_dict[SHARED_PARAMETERS] + output[SPARSE_PRUNING_ENABLED] = get_scalar_param( + sub_param_dict, + SPARSE_PRUNING_ENABLED, + SPARSE_PRUNING_ENABLED_DEFAULT) + output[SPARSE_PRUNING_METHOD] = get_scalar_param(sub_param_dict, + SPARSE_PRUNING_METHOD, + SPARSE_PRUNING_METHOD_DEFAULT) + assert output[SPARSE_PRUNING_METHOD] in [SPARSE_PRUNING_METHOD_L1, SPARSE_PRUNING_METHOD_TOPK], f"Invalid sparse pruning method. Supported types: [{SPARSE_PRUNING_METHOD_L1}, {SPARSE_PRUNING_METHOD_TOPK}]" + output[SPARSE_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( + sub_param_dict, + SPARSE_PRUNING_SCHEDULE_OFFSET, + SPARSE_PRUNING_SCHEDULE_OFFSET_DEFAULT) + else: + output[SPARSE_PRUNING_ENABLED] = SPARSE_PRUNING_ENABLED_DEFAULT + output[SPARSE_PRUNING_METHOD] = SPARSE_PRUNING_METHOD_DEFAULT + output[SPARSE_PRUNING_SCHEDULE_OFFSET] = SPARSE_PRUNING_SCHEDULE_OFFSET_DEFAULT + return output + + +def get_sparse_pruning_different_groups(param_dict): + output = {} + sub_param_dict = param_dict[DIFFERENT_GROUPS] + + def get_params(name, group_dict): + assert SPARSE_PRUNING_DENSE_RATIO in group_dict.keys(), f"{SPARSE_PRUNING_DENSE_RATIO} must be specified for sparse pruning group {name}" + return group_dict + + for k, v in sub_param_dict.items(): + output[k] = {} + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( + k, + sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + + return output + + +def get_row_pruning(param_dict): + 
output = {} + if ROW_PRUNING not in param_dict.keys(): + param_dict[ROW_PRUNING] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} + sub_param_dict = param_dict[ROW_PRUNING] + # shared parameters + output[SHARED_PARAMETERS] = get_row_pruning_shared_parameters(sub_param_dict) + # each sub-groups + if output[SHARED_PARAMETERS][ROW_PRUNING_ENABLED]: + assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Row Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + output[DIFFERENT_GROUPS] = get_row_pruning_different_groups(sub_param_dict) + return output + + +def get_row_pruning_shared_parameters(param_dict): + output = {} + if SHARED_PARAMETERS in param_dict.keys(): + sub_param_dict = param_dict[SHARED_PARAMETERS] + output[ROW_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, + ROW_PRUNING_ENABLED, + ROW_PRUNING_ENABLED_DEFAULT) + output[ROW_PRUNING_METHOD] = get_scalar_param(sub_param_dict, + ROW_PRUNING_METHOD, + ROW_PRUNING_METHOD_DEFAULT) + assert output[ROW_PRUNING_METHOD] in [ROW_PRUNING_METHOD_L1, ROW_PRUNING_METHOD_TOPK], f"Invalid row pruning method. 
Supported types: [{ROW_PRUNING_METHOD_L1}, {ROW_PRUNING_METHOD_TOPK}]" + output[ROW_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( + sub_param_dict, + ROW_PRUNING_SCHEDULE_OFFSET, + ROW_PRUNING_SCHEDULE_OFFSET_DEFAULT) + else: + output[ROW_PRUNING_ENABLED] = ROW_PRUNING_ENABLED_DEFAULT + output[ROW_PRUNING_METHOD] = ROW_PRUNING_METHOD_DEFAULT + output[ROW_PRUNING_SCHEDULE_OFFSET] = ROW_PRUNING_SCHEDULE_OFFSET_DEFAULT + return output + + +def get_row_pruning_different_groups(param_dict): + output = {} + sub_param_dict = param_dict[DIFFERENT_GROUPS] + + def get_params(name, group_dict): + assert ROW_PRUNING_DENSE_RATIO in group_dict.keys(), f"{ROW_PRUNING_DENSE_RATIO} must be specified for row pruning group {name}" + return group_dict + + for k, v in sub_param_dict.items(): + output[k] = {} + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( + k, + sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + return output + + +def get_head_pruning(param_dict): + output = {} + if HEAD_PRUNING not in param_dict.keys(): + param_dict[HEAD_PRUNING] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} + sub_param_dict = param_dict[HEAD_PRUNING] + # shared parameters + output[SHARED_PARAMETERS] = get_head_pruning_shared_parameters(sub_param_dict) + # each sub-groups + if output[SHARED_PARAMETERS][HEAD_PRUNING_ENABLED]: + assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Head Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + output[DIFFERENT_GROUPS] = get_head_pruning_different_groups(sub_param_dict) + return output + + +def get_head_pruning_shared_parameters(param_dict): + output = {} + if SHARED_PARAMETERS in param_dict.keys(): + 
sub_param_dict = param_dict[SHARED_PARAMETERS] + output[HEAD_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, + HEAD_PRUNING_ENABLED, + HEAD_PRUNING_ENABLED_DEFAULT) + output[HEAD_PRUNING_METHOD] = get_scalar_param(sub_param_dict, + HEAD_PRUNING_METHOD, + HEAD_PRUNING_METHOD_DEFAULT) + assert output[HEAD_PRUNING_METHOD] in [HEAD_PRUNING_METHOD_L1, HEAD_PRUNING_METHOD_TOPK], f"Invalid head pruning method. Supported types: [{HEAD_PRUNING_METHOD_L1}, {HEAD_PRUNING_METHOD_TOPK}]" + output[HEAD_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( + sub_param_dict, + HEAD_PRUNING_SCHEDULE_OFFSET, + HEAD_PRUNING_SCHEDULE_OFFSET_DEFAULT) + if output[HEAD_PRUNING_ENABLED]: + assert HEAD_PRUNING_NUM_HEADS in sub_param_dict.keys(), f"{HEAD_PRUNING_NUM_HEADS} must be specified for head pruning" + output[HEAD_PRUNING_NUM_HEADS] = sub_param_dict[HEAD_PRUNING_NUM_HEADS] + else: + output[HEAD_PRUNING_ENABLED] = HEAD_PRUNING_ENABLED_DEFAULT + output[HEAD_PRUNING_METHOD] = HEAD_PRUNING_METHOD_DEFAULT + output[HEAD_PRUNING_SCHEDULE_OFFSET] = HEAD_PRUNING_SCHEDULE_OFFSET_DEFAULT + return output + + +def get_head_pruning_different_groups(param_dict): + output = {} + sub_param_dict = param_dict[DIFFERENT_GROUPS] + + def get_params(name, group_dict): + assert HEAD_PRUNING_DENSE_RATIO in group_dict.keys(), f"dense_ratio must be specified for head pruning group {name}" + return group_dict + + for k, v in sub_param_dict.items(): + output[k] = {} + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( + k, + sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + return output + + +def get_channel_pruning(param_dict): + output = {} + if CHANNEL_PRUNING not in 
param_dict.keys(): + param_dict[CHANNEL_PRUNING] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} + sub_param_dict = param_dict[CHANNEL_PRUNING] + # shared parameters + output[SHARED_PARAMETERS] = get_channel_pruning_shared_parameters(sub_param_dict) + # each sub-groups + if output[SHARED_PARAMETERS][CHANNEL_PRUNING_ENABLED]: + assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + output[DIFFERENT_GROUPS] = get_channel_pruning_different_groups(sub_param_dict) + return output + + +def get_channel_pruning_shared_parameters(param_dict): + output = {} + if SHARED_PARAMETERS in param_dict.keys(): + sub_param_dict = param_dict[SHARED_PARAMETERS] + output[CHANNEL_PRUNING_ENABLED] = get_scalar_param( + sub_param_dict, + CHANNEL_PRUNING_ENABLED, + CHANNEL_PRUNING_ENABLED_DEFAULT) + output[CHANNEL_PRUNING_METHOD] = get_scalar_param( + sub_param_dict, + CHANNEL_PRUNING_METHOD, + CHANNEL_PRUNING_METHOD_DEFAULT) + assert output[CHANNEL_PRUNING_METHOD] in [CHANNEL_PRUNING_METHOD_L1, CHANNEL_PRUNING_METHOD_TOPK], f"Invalid channel pruning method. 
Supported types: [{CHANNEL_PRUNING_METHOD_L1}, {CHANNEL_PRUNING_METHOD_TOPK}]" + output[CHANNEL_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( + sub_param_dict, + CHANNEL_PRUNING_SCHEDULE_OFFSET, + CHANNEL_PRUNING_SCHEDULE_OFFSET_DEFAULT) + else: + output[CHANNEL_PRUNING_ENABLED] = CHANNEL_PRUNING_ENABLED_DEFAULT + output[CHANNEL_PRUNING_METHOD] = CHANNEL_PRUNING_METHOD_DEFAULT + output[CHANNEL_PRUNING_SCHEDULE_OFFSET] = CHANNEL_PRUNING_SCHEDULE_OFFSET_DEFAULT + return output + + +def get_channel_pruning_different_groups(param_dict): + output = {} + sub_param_dict = param_dict[DIFFERENT_GROUPS] + + def get_params(name, group_dict): + assert CHANNEL_PRUNING_DENSE_RATIO in group_dict.keys(), f"{CHANNEL_PRUNING_DENSE_RATIO} must be specified for channel pruning group {name}" + return group_dict + + for k, v in sub_param_dict.items(): + output[k] = {} + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( + k, + sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( + sub_param_dict[k], + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, + DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + + return output diff --git a/deepspeed/compression/constants.py b/deepspeed/compression/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..593b86e5f5c9cac03edee0d87298aad8d09d41e9 --- /dev/null +++ b/deepspeed/compression/constants.py @@ -0,0 +1,170 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +######################################### +# Compression Methods +# It has several sub-components +# ######################################### +COMPRESSION_TRAINING = "compression_training" +SHARED_PARAMETERS = "shared_parameters" +DIFFERENT_GROUPS = "different_groups" +TECHNIQUE_ENABLED = "enabled" +TECHNIQUE_SCHEDULE_OFFSET = "schedule_offset" 
+DIFFERENT_GROUPS_PARAMETERS = "params" +DIFFERENT_GROUPS_MODULE_SCOPE = "modules" +DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT = "*" +DIFFERENT_GROUPS_RELATED_MODULE_SCOPE = "related_modules" +DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT = None +# COMPRESSION_TRAINING_ENABLED = "enabled" +# COMPRESSION_TRAINING_ENABLED_DEFAULT = False + +#### +# Layer Reduction +#### +LAYER_REDUCTION = "layer_reduction" +LAYER_REDUCTION_ENABLED = "enabled" +LAYER_REDUCTION_ENABLED_DEFAULT = False +KEEP_NUMBER_LAYER = "keep_number_layer" +MODULE_NAME_PREFIX = "module_name_prefix" +TEACHER_LAYER = "teacher_layer" +OTHER_MODULE_NAME = "other_module_name" + +#### +# Weight Quantzation +#### +WEIGHT_QUANTIZATION = "weight_quantization" + +WEIGHT_QUANTIZATION_PERIOD = "quantization_period" +WEIGHT_QUANTIZATION_PERIOD_DEFAULT = 1 + +WEIGHT_QUANTIZE_IN_FORWARD_ENABLED = "quantize_weight_in_forward" +WEIGHT_QUANTIZE_IN_FORWARD_ENABLED_DEFAULT = False + +WEIGHT_QUANTIZE_ENABLED = TECHNIQUE_ENABLED +WEIGHT_QUANTIZE_ENABLED_DEFAULT = False + +WEIGHT_QUANTIZE_KERNEL = "quantizer_kernel" +WEIGHT_QUANTIZE_KERNEL_DEFAULT = False + +WEIGHT_QUANTIZE_SCHEDULE_OFFSET = TECHNIQUE_SCHEDULE_OFFSET +WEIGHT_QUANTIZE_SCHEDULE_OFFSET_DEFAULT = 0 + +WEIGHT_QUANTIZE_GROUPS = "quantize_groups" +WEIGHT_QUANTIZE_GROUPS_DEFAULT = 1 + +WEIGHT_QUANTIZE_VERBOSE = "quantize_verbose" +WEIGHT_QUANTIZE_VERBOSE_DEFAULT = False + +WEIGHT_QUANTIZE_TYPE = "quantization_type" +WEIGHT_QUANTIZE_TYPE_DEFAULT = "symmetric" +WEIGHT_QUANTIZE_SYMMETRIC = "symmetric" +WEIGHT_QUANTIZE_ASYMMETRIC = "asymmetric" + +WEIGHT_QUANTIZE_ROUNDING = "rounding" +WEIGHT_QUANTIZE_ROUNDING_DEFAULT = "nearest" +WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING = "stochastic" +WEIGHT_QUANTIZE_NEAREST_ROUNDING = "nearest" +# maybe deleted for a cleaner version +WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE = "fp16_mixed_quantize" + +WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED = "enabled" +WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT = False + +WEIGHT_QUANTIZE_CHANGE_RATIO 
= "quantize_change_ratio" +WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT = 0.001 + +WEIGHT_QUANTIZE_START_BITS = "start_bits" +WEIGHT_QUANTIZE_TARGET_BITS = "target_bits" +### +# Activation Quantization +### +ACTIVATION_QUANTIZATION = "activation_quantization" + +ACTIVATION_QUANTIZATION_ENABLED = TECHNIQUE_ENABLED +ACTIVATION_QUANTIZATION_ENABLED_DEFAULT = False + +ACTIVATION_QUANTIZE_SCHEDULE_OFFSET = TECHNIQUE_SCHEDULE_OFFSET +ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT = 1000 + +ACTIVATION_QUANTIZE_TYPE = "quantization_type" +ACTIVATION_QUANTIZE_TYPE_DEFAULT = "symmetric" +ACTIVATION_QUANTIZE_SYMMETRIC = "symmetric" +ACTIVATION_QUANTIZE_ASYMMETRIC = "asymmetric" + +ACTIVATION_QUANTIZE_RANGE = 'range_calibration' +ACTIVATION_QUANTIZE_RANGE_DEFAULT = 'dynamic' +ACTIVATION_QUANTIZE_RANGE_STATIC = 'static' +ACTIVATION_QUANTIZE_RANGE_DYNAMIC = 'dynamic' + +ACTIVATION_QUANTIZE_BITS = "bits" +### +# Sparse Pruning +### +SPARSE_PRUNING = "sparse_pruning" + +SPARSE_PRUNING_ENABLED = TECHNIQUE_ENABLED +SPARSE_PRUNING_ENABLED_DEFAULT = False + +SPARSE_PRUNING_METHOD = "method" +SPARSE_PRUNING_METHOD_DEFAULT = "l1" +SPARSE_PRUNING_METHOD_L1 = "l1" +SPARSE_PRUNING_METHOD_TOPK = "topk" + +SPARSE_PRUNING_SCHEDULE_OFFSET = TECHNIQUE_SCHEDULE_OFFSET +SPARSE_PRUNING_SCHEDULE_OFFSET_DEFAULT = 1000 + +SPARSE_PRUNING_DENSE_RATIO = "dense_ratio" +### +# Row Pruning +### +ROW_PRUNING = "row_pruning" + +ROW_PRUNING_ENABLED = TECHNIQUE_ENABLED +ROW_PRUNING_ENABLED_DEFAULT = False + +ROW_PRUNING_METHOD = "method" +ROW_PRUNING_METHOD_DEFAULT = "l1" +ROW_PRUNING_METHOD_L1 = "l1" +ROW_PRUNING_METHOD_TOPK = "topk" + +ROW_PRUNING_SCHEDULE_OFFSET = TECHNIQUE_SCHEDULE_OFFSET +ROW_PRUNING_SCHEDULE_OFFSET_DEFAULT = 1000 + +ROW_PRUNING_DENSE_RATIO = "dense_ratio" + +### +# Head Pruning +### +HEAD_PRUNING = "head_pruning" + +HEAD_PRUNING_ENABLED = TECHNIQUE_ENABLED +HEAD_PRUNING_ENABLED_DEFAULT = False + +HEAD_PRUNING_METHOD = "method" +HEAD_PRUNING_METHOD_DEFAULT = "topk" +HEAD_PRUNING_METHOD_L1 = "l1" 
+HEAD_PRUNING_METHOD_TOPK = "topk" + +HEAD_PRUNING_SCHEDULE_OFFSET = TECHNIQUE_SCHEDULE_OFFSET +HEAD_PRUNING_SCHEDULE_OFFSET_DEFAULT = 1000 + +HEAD_PRUNING_NUM_HEADS = "num_heads" + +HEAD_PRUNING_DENSE_RATIO = "dense_ratio" + +### +# Channel Pruning +### +CHANNEL_PRUNING = "channel_pruning" + +CHANNEL_PRUNING_ENABLED = TECHNIQUE_ENABLED +CHANNEL_PRUNING_ENABLED_DEFAULT = False + +CHANNEL_PRUNING_METHOD = "method" +CHANNEL_PRUNING_METHOD_DEFAULT = "l1" +CHANNEL_PRUNING_METHOD_L1 = "l1" +CHANNEL_PRUNING_METHOD_TOPK = "topk" + +CHANNEL_PRUNING_SCHEDULE_OFFSET = TECHNIQUE_SCHEDULE_OFFSET +CHANNEL_PRUNING_SCHEDULE_OFFSET_DEFAULT = 1000 + +CHANNEL_PRUNING_DENSE_RATIO = "dense_ratio" diff --git a/deepspeed/compression/helper.py b/deepspeed/compression/helper.py new file mode 100644 index 0000000000000000000000000000000000000000..e839a5d035823725266f7ca85cc8ab693e91de53 --- /dev/null +++ b/deepspeed/compression/helper.py @@ -0,0 +1,283 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from .basic_layer import Embedding_Compress, LinearLayer_Compress, Conv2dLayer_Compress, BNLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress +from .constants import * + + +def recursive_getattr(model, module_name): + """ + Recursively get the attribute of a module. + Args: + model (`torch.nn.Module`) + The model to get the attribute from. + module_name (`str`) + The name of the module to get the attribute from. + """ + split_list = module_name.split('.') + output = model + for name in split_list: + output = getattr(output, name) + return output + + +def recursive_setattr(model, module_name, module): + """ + Recursively set the attribute of a module. + Args: + model (`torch.nn.Module`) + The model to set the attribute in. + module_name (`str`) + The name of the module to set the attribute in. + module (`torch.nn.Module`) + The module to set the attribute to. 
+ """ + split_list = module_name.split('.') + output = model + for name in split_list[:-1]: + output = getattr(output, name) + output.__setattr__(split_list[-1], module) + + +def module_replacement(model, module_name, compression_technique=None, mpu=None): + """ + Replace a module with a new module. + Args: + model (`torch.nn.Module`) + The model to replace the module in. + module_name (`str`) + The name of the module to replace. + compression_technique (`str`) + The compression technique to use for the new module. + """ + + # Get the old module + old_module = recursive_getattr(model, module_name) + + need_bias = False + if hasattr(old_module, 'bias') and old_module.bias is not None: + need_bias = True + + # Initialize the new module + if isinstance(old_module, + LinearLayer_Compress) or isinstance(old_module, + torch.nn.Linear): + if isinstance(old_module, LinearLayer_Compress): + new_module = old_module + else: + new_module = LinearLayer_Compress(old_module.in_features, + old_module.out_features, + bias=need_bias).to( + device=old_module.weight.device, + dtype=old_module.weight.dtype) + new_module.weight.data = old_module.weight.data + if need_bias: + new_module.bias.data = old_module.bias.data + elif isinstance(old_module, + Conv2dLayer_Compress) or isinstance(old_module, + torch.nn.Conv2d): + if isinstance(old_module, Conv2dLayer_Compress): + new_module = old_module + else: + new_module = Conv2dLayer_Compress(old_module.in_channels, old_module.out_channels, old_module.kernel_size, old_module.stride, old_module.padding, \ + old_module.dilation, old_module.groups, need_bias, \ + old_module.padding_mode).to(device=old_module.weight.device, dtype=old_module.weight.dtype) + new_module.weight.data = old_module.weight.data + if need_bias: + new_module.bias.data = old_module.bias.data + elif isinstance(old_module, torch.nn.BatchNorm2d): + new_module = BNLayer_Compress(old_module.num_features, + old_module.eps, + old_module.momentum, + old_module.affine, + 
old_module.track_running_stats).to( + old_module.weight.device, + old_module.weight.dtype) + new_module.weight.data = old_module.weight.data + if need_bias: + new_module.bias.data = old_module.bias.data + new_module.running_mean.data = old_module.running_mean.data + new_module.running_var.data = old_module.running_var.data + elif isinstance(old_module, + Embedding_Compress) or isinstance(old_module, + torch.nn.Embedding): + if isinstance(old_module, Embedding_Compress): + new_module = old_module + else: + new_module = Embedding_Compress(old_module.num_embeddings, old_module.embedding_dim, old_module.padding_idx, old_module.max_norm, old_module.norm_type, \ + old_module.scale_grad_by_freq, old_module.sparse).to(device=old_module.weight.device, dtype=old_module.weight.dtype) + new_module.weight.data = old_module.weight.data + elif mpu is not None and (isinstance(old_module, + ColumnParallelLinear_Compress) + or isinstance(old_module, + mpu.ColumnParallelLinear)): + if isinstance(old_module, ColumnParallelLinear_Compress): + new_module = old_module + else: + new_module = ColumnParallelLinear_Compress( + mpu, + old_module.input_size, + old_module.output_size, + gather_output=old_module.gather_output, + skip_bias_add=old_module.skip_bias_add, + bias=need_bias).to(device=old_module.weight.device, + dtype=old_module.weight.dtype) + new_module.weight.data = old_module.weight.data + if need_bias: + new_module.bias.data = old_module.bias.data + elif mpu is not None and (isinstance(old_module, + RowParallelLinear_Compress) + or isinstance(old_module, + mpu.RowParallelLinear)): + if isinstance(old_module, RowParallelLinear_Compress): + new_module = old_module + else: + new_module = RowParallelLinear_Compress( + mpu, + old_module.input_size, + old_module.output_size, + input_is_parallel=old_module.input_is_parallel, + skip_bias_add=old_module.skip_bias_add, + bias=need_bias).to(device=old_module.weight.device, + dtype=old_module.weight.dtype) + new_module.weight.data = 
old_module.weight.data + if need_bias: + new_module.bias.data = old_module.bias.data + else: + new_module = None + + if compression_technique is not None: + for k, v in compression_technique.items(): + if k == SPARSE_PRUNING: + if v[SPARSE_PRUNING_ENABLED]: + new_module.enable_sparse_pruning(v[SPARSE_PRUNING_DENSE_RATIO], + v[SPARSE_PRUNING_METHOD]) + elif k == ROW_PRUNING: + if v[ROW_PRUNING_ENABLED]: + new_module.enable_row_pruning(v[ROW_PRUNING_DENSE_RATIO], + v[ROW_PRUNING_METHOD]) + elif k == HEAD_PRUNING: + if v[HEAD_PRUNING_ENABLED]: + new_module.enable_head_pruning(v[HEAD_PRUNING_DENSE_RATIO], + v[HEAD_PRUNING_METHOD], + v[HEAD_PRUNING_NUM_HEADS]) + elif k == ACTIVATION_QUANTIZATION: + if v[ACTIVATION_QUANTIZATION_ENABLED]: + new_module.enable_activation_quantization( + v[ACTIVATION_QUANTIZE_BITS], + v[ACTIVATION_QUANTIZE_TYPE], + v[ACTIVATION_QUANTIZE_RANGE]) + elif k == WEIGHT_QUANTIZATION: + if v[WEIGHT_QUANTIZE_ENABLED]: + new_module.enable_weight_quantization( + v[WEIGHT_QUANTIZE_START_BITS], + v[WEIGHT_QUANTIZE_TARGET_BITS], + v[WEIGHT_QUANTIZATION_PERIOD], + v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED], + v[WEIGHT_QUANTIZE_TYPE], + v[WEIGHT_QUANTIZE_GROUPS]) + elif k == CHANNEL_PRUNING: + if v[CHANNEL_PRUNING_ENABLED]: + new_module.enable_channel_pruning(v[CHANNEL_PRUNING_DENSE_RATIO], + v[CHANNEL_PRUNING_METHOD]) + else: + raise NotImplementedError( + 'Compression technique {} is not implemented'.format(k)) + + # Replace the old module with the new one + recursive_setattr(model, module_name, new_module) + + +def is_module_compressible(module, mpu=None): + ret = isinstance(module, torch.nn.Linear) or \ + isinstance(module, torch.nn.Conv2d) or \ + isinstance(module, torch.nn.Embedding) or \ + isinstance(module, torch.nn.BatchNorm2d) + + if mpu is not None: + ret = ret or isinstance(module, + mpu.RowParallelLinear) or isinstance( + module, + mpu.ColumnParallelLinear) + + return ret + + +def compression_preparation(model, compression_techinique_list, mpu): + 
""" + Prepare the compression techniques of a model. + Args: + model (`torch.nn.Module`) + The model to prepare the compression techniques of. + compression_techinique_list (`list`) + The list of compression techniques to prepare the model to. + list[] + """ + # Here we first replace all module with our linear wrapper + for module_name, module in model.named_modules(): + if is_module_compressible(module, mpu): + module_replacement(model, module_name, mpu=mpu) + for module_name_lists, _, compression_technique in compression_techinique_list: + for mnl in module_name_lists: + for module_name in mnl: + module_replacement(model, module_name, compression_technique) + + return model + + +def fix_compression(model, + module_name, + compression_technique, + mask=None, + dim_reduction=False): + """ + Fix the compression technique of a module. + Args: + model (`torch.nn.Module`) + The model to fix the compression technique of. + module_name (`str`) + The name of the module to fix the compression technique of. + compression_technique (`str`) + The compression technique to fix the module to. 
+ """ + # Here we can make things much simpler by just replacing the module + module = recursive_getattr(model, module_name) + for k, v in compression_technique.items(): + if k == WEIGHT_QUANTIZATION and v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] and v[ + WEIGHT_QUANTIZE_ENABLED]: + return module.fix_weight_quantization() + elif k == SPARSE_PRUNING and v[SPARSE_PRUNING_ENABLED]: + return module.fix_sparse_pruning_helper() + elif k == ROW_PRUNING and (v[ROW_PRUNING_ENABLED] or mask is not None): + return module.fix_row_col_pruning_helper(mask, dim_reduction=dim_reduction) + elif k == HEAD_PRUNING and (v[HEAD_PRUNING_ENABLED] or mask is not None): + return module.fix_head_pruning_helper(mask, + v[HEAD_PRUNING_NUM_HEADS], + dim_reduction=dim_reduction) + elif k == CHANNEL_PRUNING and (v[CHANNEL_PRUNING_ENABLED] or mask is not None): + return module.fix_channel_pruning_helper(mask, dim_reduction=dim_reduction) + + +def convert_conv1d_to_linear(model, convert_type): + ''' + This is a help function to convert conv1d to linear (e.g., convert GPT2 from HF) + ''' + if hasattr(model, 'module'): + c_model = model.module + else: + c_model = model + + for name, module in c_model.named_modules(): + if isinstance(module, convert_type): + old_module = recursive_getattr(c_model, name) + new_module = torch.nn.Linear( + old_module.weight.data.size(0), + old_module.weight.data.size(1), + bias=True if old_module.bias is not None else False) + new_module.weight.data = old_module.weight.data.t().contiguous() + if new_module.bias is not None: + new_module.bias.data = old_module.bias.data.view(-1) + + recursive_setattr(c_model, name, new_module) + + return model diff --git a/deepspeed/compression/scheduler.py b/deepspeed/compression/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..67955a825251c468c361be7dce4e9fd61d02245e --- /dev/null +++ b/deepspeed/compression/scheduler.py @@ -0,0 +1,173 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .compress import 
get_module_name +from .constants import * +from .helper import recursive_getattr +from deepspeed.utils import logger + + +class compression_scheduler(): + ''' + Used to schedule different compression methods + ''' + def __init__(self, model, compression_config): + self.model = model + self.compression_config = compression_config + self.make_init() + self.training_steps = 0 + self.weight_quantization_enabled = False + + self.verbose = { + WEIGHT_QUANTIZATION: False, + ACTIVATION_QUANTIZATION: False, + SPARSE_PRUNING: False, + HEAD_PRUNING: False, + ROW_PRUNING: False, + CHANNEL_PRUNING: False + } + + def make_init(self): + self.different_compression_methods = {} + for method, method_content in self.compression_config.items(): + if LAYER_REDUCTION in method: + continue + self.different_compression_methods[method] = { + TECHNIQUE_ENABLED: False, + SHARED_PARAMETERS: None, + DIFFERENT_GROUPS: [] + } + exist_module_name = set() + shared_parameters = method_content[SHARED_PARAMETERS] + self.different_compression_methods[method][ + TECHNIQUE_ENABLED] = shared_parameters[TECHNIQUE_ENABLED] + self.different_compression_methods[method][ + SHARED_PARAMETERS] = shared_parameters + + for group_name, method_parameters in method_content[DIFFERENT_GROUPS].items(): + module_name_list = [] + for key_word in method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE]: + module_name, exist_module_name = get_module_name(group_name, self.model, key_word, exist_module_name, verbose=False) + module_name_list.extend(module_name) + if module_name_list: + self.different_compression_methods[method][DIFFERENT_GROUPS].append([ + group_name, + module_name_list, + method_parameters.copy().pop('params') + ]) + + def check_weight_quantization(self): + # check weight quantization + wq = self.different_compression_methods[WEIGHT_QUANTIZATION] + if not wq[TECHNIQUE_ENABLED]: + return + else: + shared_parameters = wq[SHARED_PARAMETERS] + if self.training_steps >= shared_parameters[TECHNIQUE_SCHEDULE_OFFSET]: + 
for group_name, module_name_list, method_parameters in wq[DIFFERENT_GROUPS]: + for module_name in module_name_list: + module = recursive_getattr(self.model, module_name) + module.weight_quantization_enabled = True + + if not self.verbose[WEIGHT_QUANTIZATION]: + logger.info( + f'Weight quantization is enabled at step {self.training_steps}') + self.weight_quantization_enabled = True + self.verbose[WEIGHT_QUANTIZATION] = True + + def check_activation_quantization(self): + # check activation quantization + aq = self.different_compression_methods[ACTIVATION_QUANTIZATION] + if not aq[TECHNIQUE_ENABLED]: + return + else: + shared_parameters = aq[SHARED_PARAMETERS] + if self.training_steps >= shared_parameters[TECHNIQUE_SCHEDULE_OFFSET]: + for group_name, module_name_list, method_parameters in aq[DIFFERENT_GROUPS]: + for module_name in module_name_list: + module = recursive_getattr(self.model, module_name) + module.activation_quantization_enabled = True + if not self.verbose[ACTIVATION_QUANTIZATION]: + logger.info( + f'Activation quantization is enabled at step {self.training_steps}' + ) + self.verbose[ACTIVATION_QUANTIZATION] = True + + def check_sparse_pruning(self): + # check sparse pruning + sp = self.different_compression_methods[SPARSE_PRUNING] + if not sp[TECHNIQUE_ENABLED]: + return + else: + shared_parameters = sp[SHARED_PARAMETERS] + if self.training_steps >= shared_parameters[TECHNIQUE_SCHEDULE_OFFSET]: + for group_name, module_name_list, method_parameters in sp[DIFFERENT_GROUPS]: + for module_name in module_name_list: + module = recursive_getattr(self.model, module_name) + module.sparse_pruning_enabled = True + if not self.verbose[SPARSE_PRUNING]: + logger.info( + f'Sparse pruning is enabled at step {self.training_steps}') + self.verbose[SPARSE_PRUNING] = True + + def check_head_pruning(self): + # check head pruning + hp = self.different_compression_methods[HEAD_PRUNING] + if not hp[TECHNIQUE_ENABLED]: + return + else: + shared_parameters = 
hp[SHARED_PARAMETERS] + if self.training_steps >= shared_parameters[TECHNIQUE_SCHEDULE_OFFSET]: + for group_name, module_name_list, method_parameters in hp[DIFFERENT_GROUPS]: + for module_name in module_name_list: + module = recursive_getattr(self.model, module_name) + module.head_pruning_enabled = True + if not self.verbose[HEAD_PRUNING]: + logger.info(f'Head pruning is enabled at step {self.training_steps}') + self.verbose[HEAD_PRUNING] = True + + def check_row_pruning(self): + # check row pruning + rp = self.different_compression_methods[ROW_PRUNING] + if not rp[TECHNIQUE_ENABLED]: + return + else: + shared_parameters = rp[SHARED_PARAMETERS] + if self.training_steps >= shared_parameters[TECHNIQUE_SCHEDULE_OFFSET]: + for group_name, module_name_list, method_parameters in rp[DIFFERENT_GROUPS]: + for module_name in module_name_list: + module = recursive_getattr(self.model, module_name) + module.row_pruning_enabled = True + if not self.verbose[ROW_PRUNING]: + logger.info(f'Row pruning is enabled at step {self.training_steps}') + self.verbose[ROW_PRUNING] = True + + def check_channel_pruning(self): + # check channel pruning + cp = self.different_compression_methods[CHANNEL_PRUNING] + if not cp[TECHNIQUE_ENABLED]: + return + else: + shared_parameters = cp[SHARED_PARAMETERS] + if self.training_steps >= shared_parameters[TECHNIQUE_SCHEDULE_OFFSET]: + for group_name, module_name_list, method_parameters in cp[DIFFERENT_GROUPS]: + for module_name in module_name_list: + module = recursive_getattr(self.model, module_name) + module.channel_pruning_enabled = True + if not self.verbose[CHANNEL_PRUNING]: + logger.info( + f'Channel pruning is enabled at step {self.training_steps}') + self.verbose[CHANNEL_PRUNING] = True + + def check_all_modules(self): + # check all different compression methods we have + self.check_weight_quantization() + self.check_activation_quantization() + self.check_sparse_pruning() + self.check_head_pruning() + self.check_row_pruning() + 
self.check_channel_pruning() + + def step(self, step_zero_check=False): + if not step_zero_check: + self.training_steps += 1 + self.check_all_modules() diff --git a/deepspeed/compression/utils.py b/deepspeed/compression/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2492c8ce9d861aa8beb2ed3cac267e4dcf8b50c3 --- /dev/null +++ b/deepspeed/compression/utils.py @@ -0,0 +1,218 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from torch import autograd +import math + + +class TopKBinarizer(autograd.Function): + """ + Top-k Binarizer. + Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}` + is among the k% highest values of S. + Implementation is inspired from: + https://github.com/yaozhewei/MLPruning + """ + @staticmethod + def forward(ctx, inputs: torch.tensor, threshold: float, sigmoid: bool): + """ + Args: + inputs (`torch.FloatTensor`) + The input matrix from which the binarizer computes the binary mask. + threshold (`float`) + The percentage of weights to keep (the rest is pruned). + `threshold` is a float between 0 and 1. + sigmoid (`bool`) + Whether to apply a sigmoid on the threshold + Returns: + mask (`torch.FloatTensor`) + Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is + retained, 0 - the associated weight is pruned). + """ + # Get the subnetwork by sorting the inputs and using the top threshold + if sigmoid: + threshold = torch.sigmoid(threshold).item() + ctx.sigmoid = sigmoid + mask = inputs.clone() + + _, idx = inputs.flatten().sort(descending=True) + j = math.ceil(threshold * inputs.numel()) + + # flat_out and mask access the same memory. + flat_out = mask.flatten() + flat_out[idx[j:]] = 0. + flat_out[idx[:j]] = 1. 
class SymQuantizer(torch.autograd.Function):
    """
    Symmetric (zero-centered) fake quantization.
    """
    @staticmethod
    def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1):
        """
        Quantize `input` symmetrically to `num_bits` levels.

        Args:
            input (`torch.FloatTensor`): the tensor to quantize.
            num_bits (int, >=4): bit-width of the quantization.
            min_value/max_value (`torch.FloatTensor`): optional static range for
                activation quantization; both must be given together and only
                with `num_groups == 1`.
            num_groups (int): number of groups the tensor is partitioned into.
        Returns:
            `torch.FloatTensor`: fake-quantized tensor, same shape as `input`.
        """
        assert (min_value is None
                and max_value is None) or (min_value is not None
                                           and max_value is not None and num_groups == 1)
        levels = 2**num_bits
        orig_shape = input.shape
        if min_value is None:
            # Dynamic range: per-group absolute maximum.
            input = input.reshape(num_groups, -1)
            bound = torch.abs(input).amax(dim=-1).view(num_groups, -1)
        else:
            # Static range supplied by the caller.
            bound = torch.max(min_value.abs(), max_value).view(-1)

        scale = 2 * bound / levels
        quantized = (input / scale).round().clamp(-levels // 2, levels // 2 - 1) * scale
        return quantized.reshape(orig_shape).contiguous()

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: gradient passes through unchanged.
        return grad_output.clone(), None, None, None, None


class AsymQuantizer(torch.autograd.Function):
    """
    Asymmetric (min/max range) fake quantization.
    """
    @staticmethod
    def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1):
        """
        Quantize `input` asymmetrically to `num_bits` levels.

        Args:
            input (`torch.FloatTensor`): the tensor to quantize.
            num_bits (int, >=4): bit-width of the quantization.
            min_value/max_value (`torch.FloatTensor`): optional static range for
                activation quantization; both must be given together and only
                with `num_groups == 1`.
            num_groups (int): number of groups the tensor is partitioned into.
        Returns:
            `torch.FloatTensor`: fake-quantized tensor, same shape as `input`.
        """
        assert (min_value is None
                and max_value is None) or (min_value is not None
                                           and max_value is not None and num_groups == 1)
        levels = 2**num_bits
        orig_shape = input.shape
        if min_value is None:
            # Dynamic range: per-group min/max.
            input = input.reshape(num_groups, -1)
            min_value = input.amin(dim=-1, keepdim=True)
            max_value = input.amax(dim=-1, keepdim=True)

        scale = (max_value - min_value) / levels
        zero_point = (min_value / scale).round() * scale

        quantized = ((input - zero_point) / scale).round().clamp(0, levels - 1)
        output = quantized * scale + zero_point
        return output.reshape(orig_shape).contiguous()

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: gradient passes through unchanged.
        return grad_output.clone(), None, None, None, None


class TernaryQuantizer(torch.autograd.Function):
    """
    Ternary quantization: each group is mapped to {-alpha, 0, +alpha}.
    """
    @staticmethod
    def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1):
        """
        Args:
            input (`torch.FloatTensor`): the tensor to quantize.
            num_bits (int): unused placeholder kept for a uniform interface.
            min_value/max_value (`torch.FloatTensor`): must be None (static
                ranges are not supported here).
            num_groups (int): number of groups the tensor is partitioned into.
        Returns:
            `torch.FloatTensor`: ternarized tensor, same shape as `input`.
        """
        assert min_value is None and max_value is None
        flat = input.reshape(num_groups, -1)
        group_size = flat.shape[1]
        # Threshold is 0.7 * mean(|x|) per group (standard TWN heuristic).
        delta = (0.7 * flat.norm(p=1, dim=1).div(group_size)).view(-1, 1)
        pos = (flat > delta).type(input.type())
        neg = (flat < -delta).type(input.type())
        mask = (flat.abs() > delta).type(input.type())
        # alpha is the mean magnitude of the surviving (above-threshold) entries.
        alpha = ((mask * flat).abs().sum(dim=1) / mask.sum(dim=1)).view(-1, 1)
        ternarized = alpha * pos - alpha * neg
        return ternarized.reshape(input.shape).contiguous()

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: gradient passes through unchanged.
        return grad_output.clone(), None, None, None, None


class BinaryQuantizer(torch.autograd.Function):
    """
    Binary quantization: each group is mapped to {-m, +m} where m is the
    group's mean absolute value.
    """
    @staticmethod
    def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1):
        """
        Args:
            input (`torch.FloatTensor`): the tensor to quantize.
            num_bits (int): unused placeholder kept for a uniform interface.
            min_value/max_value (`torch.FloatTensor`): must be None (static
                ranges are not supported here).
            num_groups (int): number of groups the tensor is partitioned into.
        Returns:
            `torch.FloatTensor`: binarized tensor, same shape as `input`.
        """
        assert min_value is None and max_value is None
        flat = input.reshape(num_groups, -1)
        magnitude = flat.norm(p=1, dim=1, keepdim=True).div(flat.shape[1])
        binarized = flat.sign().mul(magnitude)
        return binarized.reshape(input.shape).contiguous()

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: gradient passes through unchanged.
        return grad_output.clone(), None, None, None, None
default_pg_timeout = timedelta(minutes=30) - INFERENCE_GENERIC_MODE = 'generic' INFERENCE_SPECIALIZED_MODE = 'specialized' diff --git a/deepspeed/elasticity/__init__.py b/deepspeed/elasticity/__init__.py index be517de7df93230a536abdb067acf3f564a3c62e..be95e3a3ee1935a13063efe4fcf128451f44775b 100644 --- a/deepspeed/elasticity/__init__.py +++ b/deepspeed/elasticity/__init__.py @@ -1 +1,7 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .elasticity import compute_elastic_config, elasticity_enabled, ensure_immutable_elastic_config +from .utils import is_torch_elastic_compatible +from .constants import ENABLED, ENABLED_DEFAULT, ELASTICITY +if is_torch_elastic_compatible(): + from .elastic_agent import DSElasticAgent diff --git a/deepspeed/elasticity/config.py b/deepspeed/elasticity/config.py index 67db58c70e71d3107cfa994dd6df674b32c479a8..ffbce7028e03204f5d42fee92d3ffcf915b33fa5 100644 --- a/deepspeed/elasticity/config.py +++ b/deepspeed/elasticity/config.py @@ -79,6 +79,7 @@ class ElasticityConfig: self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT) self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT) + if self.min_gpus < 1 or self.max_gpus < 1: raise ElasticityConfigError( "Elasticity min/max gpus must be > 0, " @@ -88,6 +89,20 @@ class ElasticityConfig: "Elasticity min_gpus cannot be greater than max_gpus, " f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") + self.model_parallel_size = param_dict.get(MODEL_PARLLEL_SIZE, + MODEL_PARLLEL_SIZE_DEFAULT) + if self.model_parallel_size < 1: + raise ElasticityConfigError( + "Model-Parallel size cannot be less than 1, " + f"given model-parallel size: {self.model_parallel_size}") + + self.num_gpus_per_node = param_dict.get(NUM_GPUS_PER_NODE, + NUM_GPUS_PER_NODE_DEFAULT) + if self.num_gpus_per_node < 1: + raise ElasticityConfigError( + "Number of GPUs per node cannot be less than 1, " + f"given number of GPUs per node: {self.num_gpus_per_node}") + self.min_time = param_dict.get(MIN_TIME, 
class DSElasticAgent(LocalElasticAgent):
    """LocalElasticAgent variant that injects a DeepSpeed-provided environment
    into every worker process and restarts the worker group when rendezvous
    membership changes (nodes join or drop out)."""
    def __init__(
        self,
        spec: WorkerSpec,
        env: Dict,
        start_method="spawn",
        exit_barrier_timeout: float = 300,
        log_dir: Optional[str] = None,
    ):
        super().__init__(spec, start_method, exit_barrier_timeout, log_dir)
        # Extra environment variables merged into each worker's env in
        # _start_workers (DeepSpeed launcher settings).
        self.ds_env = env

    @staticmethod
    def _set_master_addr_port(store: Store,
                              master_addr: Optional[str],
                              master_port: Optional[int]):
        """Publish MASTER_ADDR/MASTER_PORT to the rendezvous store, filling in
        defaults when the caller did not supply them."""
        if master_port is None:
            # Grab a free ephemeral port; the socket is closed immediately,
            # so the port is only *likely* still free when workers bind it.
            sock = _get_socket_with_port()
            with closing(sock):
                master_port = sock.getsockname()[1]

        if master_addr is None:
            # master_addr = _get_fq_hostname()
            # Uses the first address from `hostname -I` instead of the FQDN
            # (presumably because the FQDN may not resolve on all nodes —
            # TODO confirm).
            result = subprocess.check_output("hostname -I", shell=True)
            master_addr = result.decode('utf-8').split()[0]

        store.set("MASTER_ADDR", master_addr.encode(encoding="UTF-8"))
        store.set("MASTER_PORT", str(master_port).encode(encoding="UTF-8"))

    def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
        """Launch one process per local worker.

        Mirrors LocalElasticAgent._start_workers, but seeds each worker's
        environment with ``self.ds_env`` before layering the standard
        torchelastic variables on top.

        Returns:
            Mapping of local rank -> spawned process id.
        """
        spec = worker_group.spec
        store = worker_group.store
        assert store is not None
        master_addr, master_port = super()._get_master_addr_port(store)
        restart_count = spec.max_restarts - self._remaining_restarts

        use_agent_store = spec.rdzv_handler.get_backend() == "static"

        args: Dict[int, Tuple] = {}
        envs: Dict[int, Dict[str, str]] = {}
        for worker in worker_group.workers:
            local_rank = worker.local_rank

            # DeepSpeed env first; elastic vars below override on key clash.
            worker_env_ds = copy.deepcopy(self.ds_env)
            worker_env_elastic = {
                "LOCAL_RANK": str(local_rank),
                "RANK": str(worker.global_rank),
                "GROUP_RANK": str(worker_group.group_rank),
                "ROLE_RANK": str(worker.role_rank),
                "ROLE_NAME": spec.role,
                "LOCAL_WORLD_SIZE": str(spec.local_world_size),
                "WORLD_SIZE": str(worker.world_size),
                "GROUP_WORLD_SIZE": str(worker_group.group_world_size),
                "ROLE_WORLD_SIZE": str(worker.role_world_size),
                "MASTER_ADDR": master_addr,
                "MASTER_PORT": str(master_port),
                "TORCHELASTIC_RESTART_COUNT": str(restart_count),
                "TORCHELASTIC_MAX_RESTARTS": str(spec.max_restarts),
                "TORCHELASTIC_RUN_ID": spec.rdzv_handler.get_run_id(),
                "TORCHELASTIC_USE_AGENT_STORE": str(use_agent_store),
                "NCCL_ASYNC_ERROR_HANDLING": os.getenv("NCCL_ASYNC_ERROR_HANDLING",
                                                       str(1)),
            }
            worker_env_ds.update(worker_env_elastic)
            if "OMP_NUM_THREADS" in os.environ:
                worker_env_ds["OMP_NUM_THREADS"] = os.environ["OMP_NUM_THREADS"]

            envs[local_rank] = worker_env_ds
            worker_args = list(spec.args)
            worker_args = macros.substitute(worker_args, str(local_rank))
            args[local_rank] = tuple(worker_args)

        # scaling events do not count towards restarts (gets same attempt #)
        # remove existing log dir if this restart is due to a scaling event
        attempt_log_dir = os.path.join(self._log_dir, f"attempt_{restart_count}")
        shutil.rmtree(attempt_log_dir, ignore_errors=True)
        os.makedirs(attempt_log_dir)

        assert spec.entrypoint is not None
        self._pcontext = start_processes(
            name=spec.role,
            entrypoint=spec.entrypoint,
            args=args,
            envs=envs,
            log_dir=attempt_log_dir,
            start_method=self._start_method,
            redirects=spec.redirects,
            tee=spec.tee,
        )

        return self._pcontext.pids()

    def _invoke_run(self, role: str = "default") -> RunResult:
        """Main monitoring loop: start workers, then poll their state every
        ``monitor_interval`` seconds, restarting on failure or membership
        change until the group succeeds or restarts are exhausted."""
        # NOTE: currently only works for a single role

        spec = self._worker_group.spec
        role = spec.role

        log.info(
            f"[{role}] starting workers for entrypoint: {spec.get_entrypoint_name()}")

        self._initialize_workers(self._worker_group)
        monitor_interval = spec.monitor_interval
        rdzv_handler = spec.rdzv_handler

        # Snapshot of rendezvous participants; a shrink relative to this
        # snapshot is treated as a node loss below.
        # NOTE(review): reaches into rdzv_handler private state
        # (_state_holder/_settings) — tied to the c10d rendezvous backend
        # internals; confirm on torch upgrades.
        participants = rdzv_handler._state_holder.state.participants

        while True:
            assert self._worker_group.state != WorkerState.INIT
            time.sleep(monitor_interval)
            run_result = self._monitor_workers(self._worker_group)
            state = run_result.state
            self._worker_group.state = state

            # Nodes whose last heartbeat is older than the keep-alive window.
            # _dead_nodes is computed but not otherwise used here.
            expire_time = datetime.utcnow() - (
                rdzv_handler._settings.keep_alive_interval *
                rdzv_handler._settings.keep_alive_max_attempt)
            _dead_nodes = [
                node for node,
                last_heartbeat in
                rdzv_handler._state_holder.state.last_heartbeats.items()
                if last_heartbeat < expire_time
            ]

            put_metric(f"workers.{role}.remaining_restarts", self._remaining_restarts)
            put_metric(f"workers.{role}.{state.name.lower()}", 1)

            if state == WorkerState.SUCCEEDED:
                log.info(
                    f"[{role}] worker group successfully finished."
                    f" Waiting {self._exit_barrier_timeout} seconds for other agents to finish."
                )
                self._exit_barrier()
                return run_result
            elif state in {
                    WorkerState.UNHEALTHY,
                    WorkerState.FAILED
            } or len(participants) > len(rdzv_handler._state_holder.state.participants):
                # Unhealthy/failed workers, or the participant set shrank
                # (a node dropped out): consume a restart if any remain.
                if self._remaining_restarts > 0:
                    log.info(
                        f"[{role}] Worker group {state.name}. "
                        f"{self._remaining_restarts}/{spec.max_restarts} attempts left;"
                        f" will restart worker group")
                    self._remaining_restarts -= 1
                    # rdzv_handler._state_holder.state.restart = False
                    self._restart_workers(self._worker_group)
                    participants = rdzv_handler._state_holder.state.participants

                else:
                    self._stop_workers(self._worker_group)
                    self._worker_group.state = WorkerState.FAILED
                    self._exit_barrier()
                    return run_result
            elif state == WorkerState.HEALTHY:
                # membership changes do not count as retries
                num_nodes_waiting = rdzv_handler.num_nodes_waiting()
                group_rank = self._worker_group.group_rank
                if num_nodes_waiting > 0:
                    log.info(f"[{role}] Detected {num_nodes_waiting} "
                             f"new nodes from group_rank={group_rank}; "
                             f"will restart worker group")
                    self._restart_workers(self._worker_group)
                    participants = rdzv_handler._state_holder.state.participants
            else:
                raise Exception(f"[{role}] Worker group in {state.name} state")
def _get_compatible_gpus_v02(micro_batches,
                             max_acceptable_batch_size,
                             current_num_gpus,
                             min_gpus=None,
                             max_gpus=None,
                             prefer_larger=True,
                             num_gpus_per_node=1,
                             model_parallel_size=1):
    '''
    Elasticity v0.2: compute a compatible global batch size at node
    granularity, accounting for model parallelism.

    Args:
        micro_batches: candidate micro-batch sizes.
        max_acceptable_batch_size: upper bound on the global batch size.
        current_num_gpus: current world size in GPUs.
        min_gpus/max_gpus: allowed GPU-count range.
        prefer_larger: prefer larger batch/micro-batch sizes when tied.
        num_gpus_per_node: GPUs on each node; must be divisible by
            model_parallel_size.
        model_parallel_size: size of each model-parallel group.
    Returns:
        final_batch_size
        valid_gpus (list of valid data-parallel world sizes)
        micro-batch size
    Raises:
        ElasticityError: if num_gpus_per_node is not divisible by
            model_parallel_size.
    '''
    if num_gpus_per_node % model_parallel_size != 0:
        raise ElasticityError(
            f"In Elasticity v0.2, number of GPUs per node:" \
            f"{num_gpus_per_node} should be divisible by " \
            f"model parallel size {model_parallel_size}")

    def get_microbatch(final_batch_size):
        # Pick the micro-batch (largest if prefer_larger) that evenly divides
        # the per-GPU share of final_batch_size.
        candidate_microbatch = None

        for micro_batch in micro_batches:
            if final_batch_size // current_num_gpus % micro_batch == 0:
                if candidate_microbatch is None:
                    candidate_microbatch = micro_batch
                if prefer_larger and candidate_microbatch < micro_batch:
                    candidate_microbatch = micro_batch
        return candidate_microbatch

    dp_size_per_node = num_gpus_per_node // model_parallel_size

    # Delegate to v0.1 at node granularity: batch and gpu bounds are scaled
    # down by the per-node data-parallel size / gpus per node.
    final_batch_size, valid_world_size = _get_compatible_gpus_v01(
        micro_batches,
        int(max_acceptable_batch_size / dp_size_per_node),
        int(min_gpus / num_gpus_per_node),
        int(max_gpus / num_gpus_per_node),  # Passing number of max nodes as Elasticity v2 works at node level
        prefer_larger=prefer_larger)

    final_batch_size = int(final_batch_size) * dp_size_per_node
    valid_dp_world_size = [i * dp_size_per_node for i in valid_world_size]
    if current_num_gpus // model_parallel_size in valid_dp_world_size:
        candidate_microbatch = get_microbatch(final_batch_size)
        return final_batch_size, valid_dp_world_size, candidate_microbatch

    # Current DP world size is not in the valid list: fall back to the best
    # batch size achievable at the current size.
    current_dp_size = (current_num_gpus / num_gpus_per_node) * dp_size_per_node
    candidate_batch_sizes = []
    for micro_batch in micro_batches:
        min_batch_size = micro_batch * current_dp_size

        factor = math.floor(max_acceptable_batch_size / float(min_batch_size))
        candidate_batch_sizes.append(factor * min_batch_size)

    if prefer_larger:
        candidate_batch_size = max(candidate_batch_sizes)
    else:
        candidate_batch_size = min(candidate_batch_sizes)

    candidate_microbatch = get_microbatch(candidate_batch_size)

    return candidate_batch_size, [int(current_dp_size)], candidate_microbatch
Elasticity in this case means we are safe to scale @@ -250,8 +314,9 @@ def compute_elastic_config(ds_config: dict, target_deepspeed_version: str, world target_deepspeed_version (str): When called from scheduling infrastructure we want to ensure that the target deepspeed version is compatible with the elasticity version used in the backend. - world_size (int, optional): Intended/current world size, will do some sanity + world_size (int, optional): Intended/current DP world size, will do some sanity checks to ensure world size is actually valid with the config. + return_microbatch (bool, optional): whether to return micro batch size or not. Raises: ElasticityConfigError: Missing required elasticity config or elasticity disabled @@ -277,6 +342,13 @@ def compute_elastic_config(ds_config: dict, target_deepspeed_version: str, world "('enabled':true) if running an elastic training job.") elastic_config = ElasticityConfig(elastic_config_dict) + model_parallel_size = elastic_config.model_parallel_size + num_gpus_per_node = elastic_config.num_gpus_per_node + + if model_parallel_size > 1 and float(elastic_config.version) != 0.2: + raise ElasticityConfigError(f"Elasticity V{elastic_config.version} " \ + f"does not support model-parallel training. 
Given model-parallel size: " \ + f"{model_parallel_size}") if float(elastic_config.version) > LATEST_ELASTICITY_VERSION: raise ElasticityConfigError("Attempting to run elasticity version " \ @@ -297,10 +369,39 @@ def compute_elastic_config(ds_config: dict, target_deepspeed_version: str, world prefer_larger=elastic_config.prefer_larger_batch_size) # ensure batch size is int dtype final_batch_size = int(final_batch_size) + elif float(elastic_config.version) == 0.2: + if world_size != 0: + current_num_gpus = world_size + else: + if "WORLD_SIZE" in os.environ and \ + os.getenv('WORLD_SIZE').isnumeric(): + current_num_gpus = int(os.getenv('WORLD_SIZE')) + else: + WORLD_SIZE = os.getenv('WORLD_SIZE') + raise ElasticityConfigError( + 'Elasticity V 0.2 needs WORLD_SIZE '\ + 'to compute valid batch size. '\ + 'Either give it as argument to function compute_elastic_config '\ + 'or set it as an environment variable. '\ + f'Value of WORLD_SIZE as environment variable is {WORLD_SIZE}') + + final_batch_size, valid_gpus, candidate_microbatch_size = _get_compatible_gpus_v02( + micro_batches=elastic_config.micro_batches, + max_acceptable_batch_size=elastic_config.max_acceptable_batch_size, + current_num_gpus=current_num_gpus, + min_gpus=elastic_config.min_gpus, + max_gpus=elastic_config.max_gpus, + prefer_larger=elastic_config.prefer_larger_batch_size, + num_gpus_per_node=num_gpus_per_node, + model_parallel_size=model_parallel_size) + # ensure batch size is int dtype + final_batch_size = int(final_batch_size) else: raise NotImplementedError( f"Unable to find elastic logic for version: {elastic_config.version}") + logger.info(f"Valid World Size (GPUs / Model Parallel Size): {valid_gpus}") + if world_size > 0: if world_size not in valid_gpus: raise ElasticityIncompatibleWorldSize(f"World size ({world_size}) is not valid " \ @@ -317,4 +418,19 @@ def compute_elastic_config(ds_config: dict, target_deepspeed_version: str, world f" micro_batches={elastic_config.micro_batches}." 
def is_torch_elastic_compatible():
    '''
    Helper to lookup torch version. Elastic training is
    introduced in torch 1.11.

    Returns:
        bool: True when the installed torch supports elastic training
        (>= 1.11), including all 2.x and later releases.
    '''
    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])
    # The original check (`major == 1 and minor >= 11`) incorrectly returned
    # False for torch 2.x and later, which do support elastic training.
    return TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 11)
if hip_version else nvcc_version())), + ("deepspeed wheel compiled w.", + f"torch {torch_info['version']}, " + + (f"hip {torch_info['hip_version']}" + if hip_version else f"cuda {torch_info['cuda_version']}"))]) + else: + report.extend([("deepspeed wheel compiled w.", + f"torch {torch_info['version']} ")]) + print("DeepSpeed general environment info:") for name, value in report: print(name, "." * (max_dots - len(name)), value) diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index a806475c397b2927fce14930a19e169ac50af2a0..dfb8864fe6874d7c10d25a1f998cbbad9806d5d3 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -1,6 +1,8 @@ +'''Copyright The Microsoft DeepSpeed Team''' + try: # This is populated by setup.py - from .git_version_info_installed import * + from .git_version_info_installed import * # noqa: F401 except ModuleNotFoundError: import os if os.path.isfile('version.txt'): @@ -11,7 +13,7 @@ except ModuleNotFoundError: git_hash = '[none]' git_branch = '[none]' - from .ops.op_builder import ALL_OPS + from .ops.op_builder.all_ops import ALL_OPS installed_ops = dict.fromkeys(ALL_OPS.keys(), False) compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"} diff --git a/deepspeed/inference/__init__.py b/deepspeed/inference/__init__.py index 8ee60d6547b43768ea483274ed88d89895f6c7e9..449e6651e5d414a936fdeeef5055d14809a06dcd 100644 --- a/deepspeed/inference/__init__.py +++ b/deepspeed/inference/__init__.py @@ -1 +1,3 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .engine import InferenceEngine diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py new file mode 100644 index 0000000000000000000000000000000000000000..61298db3fbd470c625fe7eb04ba8a5e7a6e4323e --- /dev/null +++ b/deepspeed/inference/config.py @@ -0,0 +1,278 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from 
class DtypeEnum(Enum):
    """Supported inference dtypes; each member aliases several spellings and
    the canonical value (returned on access) is the torch dtype itself."""
    # The torch dtype must always be the first value (so we return torch.dtype)
    fp16 = torch.float16, "torch.float16", "fp16", "float16", "half"
    bf16 = torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16"
    fp32 = torch.float32, "torch.float32", "fp32", "float32", "float"
    int8 = torch.int8, "torch.int8", "int8"

    # Copied from https://stackoverflow.com/a/43210118
    # Allows us to use multiple values for each Enum index and returns first
    # listed value when Enum is called
    def __new__(cls, *values):
        obj = object.__new__(cls)
        # first value is canonical value
        obj._value_ = values[0]
        # Register every alias so e.g. DtypeEnum("half") resolves to fp16.
        for other_value in values[1:]:
            cls._value2member_map_[other_value] = obj
        obj._all_values = values
        return obj

    def __repr__(self):
        # Show all aliases, not just the canonical value.
        return "<%s.%s: %s>" % (
            self.__class__.__name__,
            self._name_,
            ", ".join([repr(v) for v in self._all_values]),
        )


class MoETypeEnum(str, Enum):
    """The two supported MoE layer variants."""
    residual = "residual"
    standard = "standard"


class DeepSpeedTPConfig(DeepSpeedConfigModel):
    """ Configure tensor parallelism settings """

    enabled: bool = True
    """ Turn tensor parallelism on/off. """

    tp_size: int = 1
    """ Number of devices to split the model across using tensor parallelism. """

    mpu: object = None
    """
    A model parallelism unit object that implements
    ``get_{model,data}_parallel_{rank,group,world_size}()``.
    """

    # NOTE(review): undocumented; appears to hold a pre-built tensor-parallel
    # process group — confirm against engine usage.
    tp_group: object = None


class DeepSpeedMoEConfig(DeepSpeedConfigModel):
    """ Sets parameters for MoE """

    enabled: bool = True
    ep_size: int = 1
    """
    The expert-parallelism size which is used for partitioning the experts
    across the GPUs in the expert-parallel group.
    """

    moe_experts: list = Field([1], alias="num_experts")
    """ The global number of experts used in an MoE layer. """

    type: MoETypeEnum = MoETypeEnum.standard
    """
    Specify the type of MoE layer. We have two types of MoE layer: 'Standard'
    and 'Residual'.
    """

    # NOTE(review): undocumented process-group handles for expert parallelism
    # — presumably populated by the engine; confirm.
    ep_mp_group: object = None
    ep_group: object = Field(None, alias="expert_group")


class QuantTypeEnum(str, Enum):
    """Quantization scheme: asymmetric (min/max) or symmetric (zero-centered)."""
    asym = "asymmetric"
    sym = "symmetric"


class BaseQuantConfig(DeepSpeedConfigModel):
    """Common quantization knobs shared by weight/activation configs."""
    # Unannotated assignments: pydantic infers the field type from the default.
    enabled = True
    num_bits = 8
    q_type: QuantTypeEnum = QuantTypeEnum.sym
    q_groups: int = 1


class WeightQuantConfig(BaseQuantConfig):
    """Quantization settings applied to model weights."""
    enabled = True


class ActivationQuantConfig(BaseQuantConfig):
    """Quantization settings applied to activations."""
    enabled = True


class QKVQuantConfig(DeepSpeedConfigModel):
    """Quantization toggle for the attention QKV projections."""
    enabled = True


class QuantizationConfig(DeepSpeedConfigModel):
    """Top-level quantization config grouping weight/activation/qkv settings."""
    enabled: bool = True
    activation: ActivationQuantConfig = ActivationQuantConfig()
    weight: WeightQuantConfig = WeightQuantConfig()
    qkv: QKVQuantConfig = QKVQuantConfig()


# todo: brainstorm on how to do ckpt loading for DS inference
class InferenceCheckpointConfig(DeepSpeedConfigModel):
    """Paths controlling checkpoint loading/saving for DS inference."""
    checkpoint_dir: str = None
    save_mp_checkpoint_path: str = None
    base_dir: str = None
+ Supported target types: `torch.half`, `torch.int8`, `torch.float` + """ + + tensor_parallel: DeepSpeedTPConfig = Field({}, alias="tp") + """ + Configuration for tensor parallelism used to split the model across several + GPUs. Expects a dictionary containing values for :any:`DeepSpeedTPConfig`. + """ + + enable_cuda_graph: bool = False + """ + Use this flag for capturing the CUDA-Graph of the inference ops, so that it + can run faster using the graph replay method. + """ + + zero: DeepSpeedZeroConfig = {} + """ + ZeRO configuration to use with the Inference Engine. Expects a dictionary + containing values for :any:`DeepSpeedZeroConfig`. + """ + + triangular_masking: bool = Field(True, alias="tm") + """ + Controls the type of masking for attention scores in transformer layer. + Note that the masking is application specific. + """ + + moe: Union[bool, DeepSpeedMoEConfig] = {} + """ + Specify if the type of Transformer is MoE. Expects a dictionary containing + values for :any:`DeepSpeedMoEConfig`. + """ + + quant: QuantizationConfig = {} + """ + NOTE: only works for int8 dtype. + Quantization settings used for quantizing your model using the MoQ. The + setting can be one element or a tuple. If one value is passed in, we + consider it as the number of groups used in quantization. A tuple is passed + in if we want to mention that there is extra-grouping for the MLP part of a + Transformer layer (e.g. (True, 8) shows we quantize the model using 8 + groups for all the network except the MLP part that we use 8 extra + grouping). Expects a dictionary containing values for + :any:`QuantizationConfig`. + """ + + #todo: refactor the following 3 into the new checkpoint_config + checkpoint: str = None + """ + Path to deepspeed compatible checkpoint or path to JSON with load policy. + """ + + base_dir: str = None + """ + This shows the root directory under which all the checkpoint files exists. + This can be passed through the json config too. 
+ """ + + save_mp_checkpoint_path: str = None + """ + The path for which we want to save the loaded model with a checkpoint. This + feature is used for adjusting the parallelism degree to help alleviate the + model loading overhead. It does not save any new checkpoint if no path is + passed. + """ + + checkpoint_config: InferenceCheckpointConfig = Field({}, alias="ckpt_config") + """ + TODO: Add docs. Expects a dictionary containing values for + :any:`InferenceCheckpointConfig`. + """ + + return_tuple: bool = True + """ + Specify whether or not the transformer layers need to return a tuple or a + Tensor. + """ + + training_mp_size: int = 1 + """ + If loading a checkpoint this is the mp size that it was trained with, it + may be different than what the mp size that you want to use during + inference. + """ + + replace_method: str = Field( + "auto", + deprecated=True, + deprecated_msg= + "This parameter is no longer needed, please remove from your call to DeepSpeed-inference" + ) + + injection_policy: Dict = Field(None, alias="injection_dict") + """ + Dictionary mapping a client nn.Module to its corresponding injection + policy. e.g., `{BertLayer : deepspeed.inference.HFBertLayerPolicy}` + """ + + injection_policy_tuple: tuple = None + """ TODO: Add docs """ + + config: Dict = Field( + None, + alias="args") # todo: really no need for this field if we can refactor + + max_out_tokens: int = Field(1024, alias="max_tokens") + """ + This argument shows the maximum number of tokens inference-engine can work + with, including the input and output tokens. Please consider increasing it + to the required token-length required for your use-case. + """ + + mp_size: int = Field(1, deprecated=True, new_param="tensor_parallel.tp_size") + """ + Desired model parallel size, default is 1 meaning no model parallelism. + Deprecated, please use the ``tensor_parallel` config to control model + parallelism. 
+ """ + mpu: object = Field(None, deprecated=True, new_param="tensor_parallel.mpu") + ep_size: int = Field(1, deprecated=True, new_param="moe.ep_size") + ep_group: object = Field(None, + alias="expert_group", + deprecated=True, + new_param="moe.ep_group") + ep_mp_group: object = Field(None, + alias="expert_mp_group", + deprecated=True, + new_param="moe.ep_mp_group") + moe_experts: list = Field([1], deprecated=True, new_param="moe.moe_experts") + moe_type: MoETypeEnum = Field(MoETypeEnum.standard, + deprecated=True, + new_param="moe.type") + + @validator("moe") + def moe_backward_compat(cls, field_value, values): + if isinstance(field_value, bool): + return DeepSpeedMoEConfig(moe=field_value) + return field_value + + class Config: + # Get the str representation of the datatype for serialization + json_encoders = {torch.dtype: lambda x: str(x)} diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py old mode 100644 new mode 100755 index a37f7c23f5990182e05120592fe8cdecf0c19980..123a028ce3a966e48353ed05ceae084c574724ce --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -2,22 +2,34 @@ Copyright 2021 The Microsoft DeepSpeed Team ''' import torch +import time import os + +from deepspeed import comm as dist +from deepspeed.utils.logging import log_dist + from torch.nn.modules import Module -import torch.distributed as dist +from packaging import version as pkg_version +from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine +from deepspeed.utils.timer import SynchronizedWallClockTimer + from ..runtime.state_dict_factory import SDLoaderFactory from ..runtime.weight_quantizer import WeightQuantization -from ..module_inject.replace_module import replace_transformer_layer -from ..utils import logger, init_distributed - +from ..module_inject import replace_transformer_layer, generic_injection +from ..comm.comm import init_distributed from ..pipe import PipelineModule from ..moe.utils import 
has_moe_layers -from ..moe.layer import MoE +from ..module_inject import LinearAllreduce, LinearLayer, Normalize, ReplaceWithTensorSlicing +from deepspeed.accelerator import get_accelerator +from ..module_inject.policy import TransformerPolicy +from ..module_inject.auto_tp import AutoTP -import torch.distributed as dist -import deepspeed.utils.groups as groups +from ..module_inject.replace_policy import generic_policies DS_INFERENCE_ENABLED = False +from torch import nn + +INFERENCE_MODEL_TIMER = "model-forward-inference" class InferenceEngine(Module): @@ -25,42 +37,11 @@ class InferenceEngine(Module): inference_ep_group = None expert_mp_group = None - def __init__(self, - model, - triangular_masking=True, - mp_size=1, - training_mp_size=1, - ep_size=1, - mpu=None, - ep_group=None, - expert_mp_group=None, - checkpoint=None, - dtype=None, - injection_dict=None, - return_tuple=True, - replace_method='auto', - quantization_setting=None, - replace_with_kernel_inject=False, - moe=False, - moe_experts=1, - moe_type='standard', - config=None): + def __init__(self, model, config): """ Args: model: torch.nn.Module - mp_size: model-parallel size - mpu: model-parallel unit (used for Megatron-type models) - checkpoint: the json-path, showing the address of model-checkpoints - Example: {type: 'Megatron', 'checkpoints': [ckpt_mp0.pt, ckpt_mp1.pt], 'version': 1.0} - dtype: data-type by which inference is executed - injection_dict: the dictionary that shows the injection policy: - Example: {BertLayer: HFBertLayerPolicy} - return_tuple: if true, inference-API returns a tuple, otherwise a tensor - replace_method: the injection method, this can be passed as auto if no injection-policy is defined, in which case the injection is automatic based on the available policies - quantization_setting: - one of None, Tuple(mlp_extra_grouping, quantize_groups), quantize_groups - replace_with_kernel_inject: this flag need to be set to true to inject inference kernels for models such as, Bert, 
GPT2, GPT-Neo and GPT-J. Otherwise, - the injection_dict provides the names of two linear layers as a tuple: (attention_output projection, transformer output projection) + config: DeepSpeedInferenceConfig """ global DS_INFERENCE_ENABLED DS_INFERENCE_ENABLED = True @@ -68,90 +49,165 @@ class InferenceEngine(Module): super().__init__() self.module = model + self._config = config + + self._get_model_config_generate(config) # keep for weird backward compatibility - self._get_model_config_generate(config) + # patch model generate with ours if model uses it + if hasattr(self.module, "generate"): + self.generate = self._generate - self.mp_world_size = mp_size - self.checkpoint = checkpoint - self.dtype = dtype - self.injection_dict = injection_dict - self.mp_group = None - self.mpu = mpu - self._validate_args(mpu) - self.replace_method = replace_method + if hasattr(self.module, "config"): + TransformerPolicy.hf_model_config = self.module.config + + # todo: keep this self.injection_dict because we don't use to change config.injection_policy API + # todo: this will get changed when Molly's PR on auto injection dict is merged + self.injection_dict = config.injection_policy + + # todo: refactor the mp_group and mp_size related in the next refactor + self.mp_group = config.tensor_parallel.tp_group + self.mpu = config.tensor_parallel.mpu + + #self._validate_args(self.mpu, config.replace_with_kernel_inject) self.quantize_merge_count = 1 self.quantization_scales = None - self.triangular_masking = triangular_masking - self.ep_size = ep_size - self.ep_group = ep_group - self.expert_mp_group = expert_mp_group - self._init_quantization_setting(quantization_setting) + # these are not needed in the config as we are creating them ourselves in the inference engine + self.ep_group = None # config.moe.ep_group + self.expert_mp_group = None # config.moe.ep_mp_group + + self.cuda_graph_created = False + self.checkpoint_engine = TorchCheckpointEngine() + quantization_setting = None + 
self._init_quantization_setting( + quantization_setting + ) # todo: update with the new quant config for weight quant + self.model_profile_enabled = False + self._model_times = [] + + # This is a hack to remove the prepare_mask function on HF side for BLOOM architecture + self.remove_mask_prepare_for_bloom() - if self.checkpoint: - self._load_checkpoint(self.checkpoint) + if get_accelerator().device_name() == 'cuda' and config.enable_cuda_graph: + assert pkg_version.parse(torch.__version__) >= pkg_version.parse("1.10"), \ + "If you want to use cuda graph, please upgrade torch to at least v1.10" + + if config.checkpoint and not config.replace_with_kernel_inject: + self._load_checkpoint(config.checkpoint) # convert model to intended dtype - if self.dtype: - self._convert_to_dtype() + if config.dtype: + self._convert_to_dtype(config) if self.mpu: - self.mp_world_size = dist.get_world_size( + config.tensor_parallel.tp_size = dist.get_world_size( group=self.mpu.get_model_parallel_group()) - self.mp_group = mpu.get_model_parallel_group() - elif self.mp_world_size > 1: - self._create_model_parallel_group() + self.mp_group = self.mpu.get_model_parallel_group() + elif config.tensor_parallel.tp_size > 1: + self._create_model_parallel_group(config) + config.tensor_parallel.tp_group = self.mp_group - moe, _ = has_moe_layers(self.module) + if isinstance(self.module, torch.nn.Module): + moe, _ = has_moe_layers(self.module) + else: + moe = False if moe and dist.get_world_size() > 1: - self._create_ep_parallel_group(moe_experts) + self._create_ep_parallel_group(config.moe.moe_experts) + + # retain this from the old conditional argument being passed to apply_injection_policy() + if not config.replace_with_kernel_inject: + config.checkpoint = None + # We only support three modes: 1) user specified policy for tensor-parallelism, 2) kernel injection (replace_with_kernel_inject), and 3) automatic tensor parallelism. if self.injection_dict: + # 1. 
User specified Tensor Parallelism + assert not config.replace_with_kernel_inject, "Cannot use both user specified injection policy and kernel injection" for client_module, injection_policy in self.injection_dict.items(): - self._apply_injection_policy(client_module, - injection_policy, - return_tuple, - replace_with_kernel_inject, - moe, - moe_experts, - moe_type, - training_mp_size) - elif replace_method == 'auto': - self._apply_injection_policy( - return_tuple=return_tuple, - replace_with_kernel_inject=replace_with_kernel_inject, - moe=moe, - moe_experts=moe_experts, - moe_type=moe_type, - training_mp_size=training_mp_size) - - device = torch.cuda.current_device() - logger.info(f"Place model to device: {device}") + # construct the tuple and pass that instead of a string or dict. + if isinstance(injection_policy, str): + config.injection_policy_tuple = (injection_policy, ) + else: + config.injection_policy_tuple = injection_policy + self._apply_injection_policy(config, client_module) + else: + if config.replace_with_kernel_inject: + # 2. DeepSpeed Kernel Injection + self._apply_injection_policy(config) + else: + # 3. 
Automatic Tensor Parallelism + parser_dict = AutoTP.tp_parser(model) + print("AutoTP: ", parser_dict) + for client_module, injection_policy in parser_dict: + if isinstance(injection_policy, str): + config.injection_policy_tuple = (injection_policy, ) + else: + config.injection_policy_tuple = injection_policy + self._apply_injection_policy(config, client_module) + + device = get_accelerator().current_device_name() self.module.to(device) - if self.mp_world_size > 1: - self.model_orig_fwd = self.module.forward - self.module.forward = self.forward - else: + if config.tensor_parallel.tp_size > 1: + _rng_state = get_accelerator().get_rng_state().to( + get_accelerator().current_device_name()) + dist.broadcast(_rng_state, 0) + get_accelerator().set_rng_state(_rng_state.cpu()) + + if config.tensor_parallel.tp_size > 1: + assert not config.enable_cuda_graph, "Cuda graph is not supported for model parallelism" + + # Check if local CUDA graphs can be created in replacement modules + self.local_cuda_graph = self._local_cuda_graph_used(self.module) + + def profile_model_time(self, use_cuda_events=True): + if not self.model_profile_enabled and not self._config.enable_cuda_graph: self.module.register_forward_pre_hook(self._pre_forward_hook) + self.module.register_forward_hook(self._post_forward_hook) + self.model_profile_enabled = True + self.use_cuda_events = use_cuda_events + if self.use_cuda_events: + self.timers = SynchronizedWallClockTimer() + # todo: remove this once all the config dicts are centralized from top level pydantic config def _get_model_config_generate(self, config): - self.config = getattr(self.module, 'config', None) if config is None else config - self.generate = getattr(self.module, 'generate', None) + # this is being passed to replace_transformer_layer(config=self.user_model_config_dict) + self.config = getattr(self.module, + 'config', + None) if config.config is None else config.config + + def remove_mask_prepare_for_bloom(self): + if hasattr(self.module, 
'transformer'): + if hasattr(self.module.transformer, '_prepare_attn_mask'): + self.module.transformer._prepare_attn_mask = lambda attention_mask, *args, **kwargs: attention_mask + + def _pre_forward_hook(self, module, *inputs, **kwargs): + if self.use_cuda_events: + self.timers(INFERENCE_MODEL_TIMER).start() + else: + get_accelerator().synchronize() + self._start = time.time() + + def _post_forward_hook(self, module, input, output): + if self.use_cuda_events: + self.timers(INFERENCE_MODEL_TIMER).stop() + elapsed_time = self.timers(INFERENCE_MODEL_TIMER).elapsed(reset=True) + else: + get_accelerator().synchronize() + self._end = time.time() + elapsed_time = self._end - self._start + self._model_times.append(elapsed_time) - def _create_model_parallel_group(self): + def _create_model_parallel_group(self, config): # Call the init process if InferenceEngine.inference_mp_group is None: init_distributed() - local_rank = int(os.getenv('LOCAL_RANK', '0')) - torch.cuda.set_device(local_rank) + get_accelerator().set_device(local_rank) - ranks = [i for i in range(self.mp_world_size)] + ranks = [i for i in range(config.tensor_parallel.tp_size)] self.mp_group = dist.new_group(ranks) InferenceEngine.inference_mp_group = self.mp_group - else: self.mp_group = InferenceEngine.inference_mp_group @@ -194,66 +250,121 @@ class InferenceEngine(Module): self.quantize_groups = quantization_setting elif quantization_setting is not None: self.quantize_groups = quantization_setting - logger.info(f"quantize_bits = {self.quantize_bits} " - f"mlp_extra_grouping = {self.mlp_extra_grouping}, " - f"quantize_groups = {self.quantize_groups}") - - def _validate_args(self, mpu): - if not isinstance(self.module, Module): + log_dist( + f"quantize_bits = {self.quantize_bits} " + f"mlp_extra_grouping = {self.mlp_extra_grouping}, " + f"quantize_groups = {self.quantize_groups}", + [0]) + + # TODO: remove this function and add this functionality to pydantic config checking + def _validate_args(self, mpu, 
replace_with_kernel_inject): + # TODO: to support SD pipeline we need to avoid this check for now + if replace_with_kernel_inject and not isinstance(self.module, Module): raise ValueError(f"model must be a torch.nn.Module, got {type(self.module)}") - if not isinstance(self.mp_world_size, int) or self.mp_world_size < 1: - raise ValueError(f"mp_size must be an int >= 1, got {self.mp_world_size}") + if not isinstance(self._config.tensor_parallel.tp_size, + int) or self._config.tensor_parallel.tp_size < 1: + raise ValueError( + f"mp_size must be an int >= 1, got {self._config.tensor_parallel.tp_size}" + ) if mpu: methods = ["get_model_parallel_group", "get_data_parallel_group"] for method in methods: if not hasattr(mpu, method): raise ValueError(f"mpu is missing {method}") - if self.checkpoint is not None and not isinstance(self.checkpoint, str): + if self._config.checkpoint is not None and not isinstance( + self._config.checkpoint, + (str, + dict)): raise ValueError( - f"checkpoint must be None or a str, got {type(self.checkpoint)}") + f"checkpoint must be None, str or dict, got {type(self._config.checkpoint)}" + ) supported_dtypes = [None, torch.half, torch.int8, torch.float] - if self.dtype not in supported_dtypes: + if self._config.dtype not in supported_dtypes: raise ValueError( - f"{self.dtype} not supported, valid dtype: {supported_dtypes}") + f"{self._config.dtype} not supported, valid dtype: {supported_dtypes}") if self.injection_dict is not None and not isinstance(self.injection_dict, dict): raise ValueError( f"injection_dict must be None or a dict, got: {self.injection_dict}") - def _apply_injection_policy(self, - client_module=None, - injection_policy=None, - return_tuple=True, - replace_with_kernel_inject=False, - moe=False, - moe_experts=1, - moe_type='standard', - training_mp_size=1): - - replace_transformer_layer(client_module, - self.module, - triangular_masking=self.triangular_masking, - policy=injection_policy, - mp_size=self.mp_world_size, - 
mp_group=self.mp_group, - ep_group=self.ep_group, - expert_mp_group=self.expert_mp_group, - config=self.config, - fp16=(self.dtype == torch.half), - training=False, - return_tuple=return_tuple, - quantize=(self.dtype == torch.int8), - quantize_settings=(self.quantization_scales, - self.quantize_merge_count, - self.mlp_extra_grouping, - self.quantize_groups), - replace_with_kernel_inject=replace_with_kernel_inject, - moe=moe, - moe_experts=moe_experts, - moe_type=moe_type, - training_mp_size=training_mp_size) + def load_model_with_checkpoint(self, r_module): + self.mp_replace = ReplaceWithTensorSlicing( + mp_group=self.mp_group, + mp_size=self._config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) + error_msgs = [] + + def load(module, state_dict, prefix): + args = (state_dict, prefix, {}, True, [], [], error_msgs) + if hasattr(module, 'weight'): + if 'query_key_value' in prefix: + module.weight = self.mp_replace.qkv_copy( + module.weight.data, + state_dict[prefix + 'weight']) + else: + module.weight = self.mp_replace.copy(module.weight.data, + state_dict[prefix + 'weight']) + else: + module.norm.weight = self.mp_replace.copy(module.norm.weight.data, + state_dict[prefix + 'weight']) + if prefix + 'bias' in self.key_list: + if hasattr(module, 'norm'): + module.norm.bias = self.mp_replace.copy(module.norm.bias, + state_dict[prefix + 'bias']) + else: + data = state_dict[prefix + 'bias'] + data = data.to(get_accelerator().current_device_name()) + module.bias = self.mp_replace.copy(module.bias, data) + + layer_policies = { + nn.Linear: load, + nn.Embedding: load, + nn.LayerNorm: load, + LinearLayer: load, + LinearAllreduce: load + } + + def load_module_recursive(module, prefix='', level=0): + for name, child in module.named_children(): + if child.__class__ in layer_policies: + checking_key = prefix + name + '.' 
+ if not any(checking_key in item for item in self.key_list): + continue + if len(list(child.parameters())) > 0 and list( + child.parameters())[0].numel() == 0: + if len(child.weight.ds_shape) == 1: + child = Normalize(dim=child.weight.ds_shape[-1], + dtype=child.weight.dtype, + eps=child.eps) + setattr(module, name, child) + load(child, self.sd, prefix + name + '.') + else: + load_module_recursive(child, + prefix if level == 0 else prefix + name + '.', + level + 1) + + load_module_recursive(r_module) + + def _apply_injection_policy(self, config, client_module=None): + # client_module is only passed when using the injection_dict method. + checkpoint_dir = config.checkpoint + checkpoint = SDLoaderFactory.get_sd_loader_json( + checkpoint_dir, + self.checkpoint_engine) if checkpoint_dir is not None else None + + generic_injection(self.module, + fp16=(config.dtype == torch.half) + or (config.dtype == torch.int8), + enable_cuda_graph=config.enable_cuda_graph) + + if isinstance(self.module, torch.nn.Module): + # config is our DeepSpeedInferenceConfig and self.config is the HF model config + replace_transformer_layer(client_module, + self.module, + checkpoint, + config, + self.config) def _get_all_ckpt_names(self, checkpoints_path, tag): ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, @@ -283,7 +394,7 @@ class InferenceEngine(Module): if is_pipe_parallel: raise RuntimeError( 'pipeline parallelism is currently not supported in inference.') - if os.path.isdir(load_dir): + if not isinstance(load_dir, dict) and os.path.isdir(load_dir): if tag is None: latest_path = os.path.join(load_dir, "latest") if os.path.isfile(latest_path): @@ -291,38 +402,54 @@ class InferenceEngine(Module): tag = fd.read().strip() ckpt_list = self._get_all_ckpt_names(load_dir, tag) - sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list) + sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list, self.checkpoint_engine) else: - sd_loader = SDLoaderFactory.get_sd_loader_json(load_dir) - - mp_rank = 0 
if self.mpu is None else self.mpu.get_model_parallel_rank() - - load_path, checkpoint, quantize_config = sd_loader.load(self.mp_world_size, - mp_rank, - is_pipe_parallel=is_pipe_parallel, - quantize=(self.dtype is torch.int8), - quantize_groups=self.quantize_groups, - mlp_extra_grouping=self.mlp_extra_grouping) - - self.quantization_scales, self.quantize_merge_count = quantize_config - - moe, _ = has_moe_layers(self.module) - if moe: - from deepspeed.runtime.engine import DeepSpeedEngine - old_moe_load = False - if not isinstance(checkpoint['num_experts'], list): - old_moe_load = True - DeepSpeedEngine.load_moe_state_dict( - load_dir, - tag, - state_dict=checkpoint[self._choose_module_key(checkpoint)], - old_moe_load=old_moe_load, - model=self.module, - mpu=self.mpu) + sd_loader = SDLoaderFactory.get_sd_loader_json(load_dir, + self.checkpoint_engine) + + if type(sd_loader) is list: + self.sd = torch.load(sd_loader[0], map_location='cpu') + self.key_list = list(self.sd.keys()) + + self.load_model_with_checkpoint(self.module) + + for i in range(1, len(sd_loader)): + if not dist.is_initialized() or dist.get_rank() == 0: + print(f"loading checkpoint ({i})") + self.sd = torch.load(sd_loader[i], + map_location=get_accelerator().device_name()) + self.key_list = list(self.sd.keys()) + self.load_model_with_checkpoint(self.module) + else: + mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - self.module.load_state_dict( - state_dict=checkpoint[self._choose_module_key(checkpoint)], - strict=load_module_strict) + load_path, checkpoint, quantize_config = sd_loader.load(self._config.tensor_parallel.tp_size, + mp_rank, + is_pipe_parallel=is_pipe_parallel, + quantize=(self._config.dtype is torch.int8), + quantize_groups=self.quantize_groups, + mlp_extra_grouping=self.mlp_extra_grouping) + + self.quantization_scales, self.quantize_merge_count = quantize_config + + moe, _ = has_moe_layers(self.module) + if moe: + from deepspeed.runtime.engine import 
DeepSpeedEngine + old_moe_load = False + if not isinstance(checkpoint['num_experts'], list): + old_moe_load = True + DeepSpeedEngine.load_moe_state_dict( + load_dir, + tag, + state_dict=checkpoint[self._choose_module_key(checkpoint)], + old_moe_load=old_moe_load, + model=self.module, + mpu=self.mpu, + checkpoint_engine=self.checkpoint_engine) + + self.module.load_state_dict( + state_dict=checkpoint[self._choose_module_key(checkpoint)], + strict=load_module_strict) def _choose_module_key(self, sd): assert not ('module' in sd and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" @@ -332,25 +459,84 @@ class InferenceEngine(Module): elif 'model' in sd: return 'model' - def _convert_to_dtype(self): - if self.dtype is torch.int8 and self.quantization_scales is None: + def _convert_to_dtype(self, config): + if not isinstance(self.module, torch.nn.Module): + return + + if False: #config.dtype is torch.int8 and self.quantization_scales is None: quantizer = WeightQuantization(mlp_extra_grouping=self.mlp_extra_grouping) model, self.quantization_scales = quantizer.model_quantize(self.module, self.injection_dict, self.quantize_bits, self.quantize_groups) - elif self.dtype == torch.half: + elif config.dtype == torch.half: self.module.half() - elif self.dtype == torch.float: + elif config.dtype == torch.bfloat16: + self.module.bfloat16() + elif config.dtype == torch.float: self.module.float() - def _pre_forward_hook(self, module, *inputs, **kwargs): - for input in inputs: - if torch.is_tensor(input): - input = input.to(torch.cuda.current_device()) + def _create_cuda_graph(self, *inputs, **kwargs): + # warmup to create the workspace and cublas handle + cuda_stream = get_accelerator().Stream() + cuda_stream.wait_stream(get_accelerator().current_stream()) + with get_accelerator().stream(cuda_stream): + for i in range(3): + ret = self.module(*inputs, **kwargs) + get_accelerator().current_stream().wait_stream(cuda_stream) + + # create cuda_graph 
and assign static_inputs and static_outputs + self._cuda_graphs = torch.cuda.CUDAGraph() + self.static_inputs = inputs + self.static_kwargs = kwargs + + with torch.cuda.graph(self._cuda_graphs): + self.static_output = self.module(*self.static_inputs, **self.static_kwargs) + + self.cuda_graph_created = True + + def _graph_replay(self, *inputs, **kwargs): + for i in range(len(inputs)): + if torch.is_tensor(inputs[i]): + self.static_inputs[i].copy_(inputs[i]) for k in kwargs: if torch.is_tensor(kwargs[k]): - kwargs[k] = kwargs[k].to(torch.cuda.current_device()) + self.static_kwargs[k].copy_(kwargs[k]) + self._cuda_graphs.replay() + return self.static_output + + def model_times(self): + assert self.model_profile_enabled, "model profiling is not enabled" + model_times = self._model_times + if self._config.enable_cuda_graph and len(self._model_times) == 0: + raise ValueError( + "Model times are empty and cuda graph is enabled. If " + "this is a GPT-style model this combo is not supported. If this is a " + "BERT-style model this is a bug, please report it. 
" + f"Model type is: {type(self.module)}") + self._model_times = [] + return model_times + + def _module_match(self, module): + for policy in generic_policies: + policy = policy() + if policy.match_replaced(module): + return True + return False + + def _local_cuda_graph_used(self, module): + if isinstance(module, torch.nn.Module): + return False + else: + sub_module_cuda_graph = False + for name in module.__dict__.keys(): + sub_module = getattr(module, name) + + if self._module_match(sub_module) and hasattr(sub_module, + "enable_cuda_graph"): + sub_module_cuda_graph = True + + return sub_module_cuda_graph def forward(self, *inputs, **kwargs): """Execute forward propagation @@ -359,22 +545,44 @@ class InferenceEngine(Module): *inputs: Variable length input list **kwargs: variable length keyword arguments """ - if self.mp_world_size > 1: - if self.mpu is None: - for input in inputs: - if torch.is_tensor(input): - input = input.to(torch.cuda.current_device()) - if not input.is_contiguous(): - input = input.contiguous() - dist.broadcast(input, 0) - for k in kwargs: - if torch.is_tensor(kwargs[k]): - kwargs[k] = kwargs[k].to(torch.cuda.current_device()) - if not kwargs[k].is_contiguous(): - kwargs[k] = kwargs[k].contiguous() - dist.broadcast(kwargs[k], 0) - - outputs = self.model_orig_fwd(*inputs, **kwargs) + start = None + if self.model_profile_enabled and get_accelerator().device_name( + ) == 'cuda' and self._config.enable_cuda_graph: + get_accelerator().synchronize() + start = time.time() + + if get_accelerator().device_name( + ) == 'cuda' and self._config.enable_cuda_graph and not self.local_cuda_graph: + if self.cuda_graph_created: + outputs = self._graph_replay(*inputs, **kwargs) + else: + self._create_cuda_graph(*inputs, **kwargs) + outputs = self._graph_replay(*inputs, **kwargs) else: outputs = self.module(*inputs, **kwargs) + + if self.model_profile_enabled and self._config.enable_cuda_graph: + get_accelerator().synchronize() + duration = time.time() - start + 
self._model_times.append(duration) + return outputs + + def _generate(self, *inputs, **kwargs): + # Reset KV-cache at the beginning of generate + if hasattr(self.module, 'reset_cache'): + self.module.reset_cache() + num_beams = 1 + if "generation_config" in kwargs: + gen_config = kwargs["generation_config"] + num_beams = getattr(gen_config, "num_beams", 1) + if "num_beams" in kwargs: + num_beams = kwargs["num_beams"] + + if num_beams > 1: + raise NotImplementedError( + "DeepSpeed does not support `num_beams` > 1, if this is important to you please " + "add your request to: https://github.com/microsoft/DeepSpeed/issues/2506" + ) + + return self.module.generate(*inputs, **kwargs) diff --git a/deepspeed/launcher/__init__.py b/deepspeed/launcher/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 100644 --- a/deepspeed/launcher/__init__.py +++ b/deepspeed/launcher/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/launcher/constants.py b/deepspeed/launcher/constants.py index 15b262342d475f10322b2a8a120eb08b95571422..09dd1bdd1def7e7866104edd21d0d4e9ae19fd8b 100644 --- a/deepspeed/launcher/constants.py +++ b/deepspeed/launcher/constants.py @@ -4,6 +4,9 @@ PDSH_LAUNCHER = 'pdsh' PDSH_MAX_FAN_OUT = 1024 OPENMPI_LAUNCHER = 'openmpi' - +MPICH_LAUNCHER = 'mpich' +SLURM_LAUNCHER = 'slurm' MVAPICH_LAUNCHER = 'mvapich' MVAPICH_TMP_HOSTFILE = '/tmp/deepspeed_mvapich_hostfile' + +ELASTIC_TRAINING_ID_DEFAULT = "123456789" diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py old mode 100644 new mode 100755 index 21ef04b338b78c7c3a11be36e9fb08046ba2a9a3..d9e6fda504b4b5cb360997aa526b2381d890156c --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -1,6 +1,6 @@ # Copyright 2020 The Microsoft DeepSpeed Team """ -DeepSpeed launcher, this is similar to torch.distributed.launch but supports +DeepSpeed launcher, this is similar to torch's distributed.launch 
but supports additional features such as arbitrary gpu exclusion. deepspeed.launcher.launch is intended to be run on a single worker node and @@ -15,11 +15,15 @@ import json import base64 import time import signal +import psutil from collections import defaultdict +from typing import Dict from argparse import ArgumentParser, REMAINDER - from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT +from ..nebula.constants import DLTS_POD_ENV_PATH from ..utils import logger +from ..elasticity import is_torch_elastic_compatible +from .constants import ELASTIC_TRAINING_ID_DEFAULT PID_FILE_BASEPATH = "/tmp" @@ -64,6 +68,20 @@ def parse_args(): help="Skip prepending the training script with " "'python' - just execute it directly.") + parser.add_argument("--enable_elastic_training", + action="store_true", + help="Enable elastic training support.") + + parser.add_argument("--min_elastic_nodes", + type=int, + default=-1, + help="Min number of nodes in elastic training.") + + parser.add_argument("--max_elastic_nodes", + type=int, + default=-1, + help="Max number of nodes in elastic training.") + parser.add_argument("--no_local_rank", action="store_true", help="Do not pass local_rank as an argument when calling " @@ -74,6 +92,12 @@ def parse_args(): default=0, help="main launching process pid, for internal pid tracking") + parser.add_argument( + "--enable_each_rank_log", + default="None", + type=str, + help="redirect the stdout and stderr from each rank into different log files") + # positional parser.add_argument("training_script", type=str, @@ -87,6 +111,21 @@ def parse_args(): return parser.parse_args() +# Adapted from https://psutil.readthedocs.io/en/latest/#kill-process-tree +def terminate_process_tree(pid): + process = psutil.Process(pid) + children = process.children(recursive=True) + children.append(process) + for child in children: + try: + child.terminate() + except psutil.NoSuchProcess: + pass + gone, alive = psutil.wait_procs(children, timeout=30) + for p in alive: + 
p.kill() + + def main(): args = parse_args() current_env = os.environ.copy() @@ -143,15 +182,93 @@ def main(): with open(pid_file, 'w') as fd: fd.write(f"{launcher_pid}") + if not is_torch_elastic_compatible(): + if args.enable_elastic_training: + logger.info(f"Disabling elastic training support as \ + PyTorch version should be greater than 1.11.x") + args.enable_elastic_training = False + + if os.path.exists(DLTS_POD_ENV_PATH): + with open(DLTS_POD_ENV_PATH) as file: + lines = file.readlines() + lines = [line.rstrip() for line in lines] + for line in lines: + if line.startswith('export FC_TASKROLE_NAME') or line.startswith( + 'export FC_TASK_INDEX'): + key_val = line.split()[1] + key, val = key_val.split('=') + current_env[key] = val + processes = [] cmd = [] - for local_rank in range(0, num_local_procs): - # each process's rank - dist_rank = global_rank_mapping[local_node][local_rank] - current_env["RANK"] = str(dist_rank) - current_env["LOCAL_RANK"] = str(local_rank) - # spawn the processes + if not args.enable_elastic_training: + if args.enable_each_rank_log != "None": + # prepare the log path and the file name prefix + if os.path.isfile(args.enable_each_rank_log): + raise ValueError( + f"{args.enable_each_rank_log} should not be a file, it should be a directory." + ) + if not os.path.exists(args.enable_each_rank_log): + try: + os.makedirs(args.enable_each_rank_log) + except Exception as e: + print(e) + raise ValueError( + f"unable to create directory {args.enable_each_rank_log} for each rank log." 
+ ) + log_name_prefix = time.strftime("%Y%m%d%H%M%S", time.localtime()) + + for local_rank in range(0, num_local_procs): + # each process's rank + dist_rank = global_rank_mapping[local_node][local_rank] + current_env["RANK"] = str(dist_rank) + current_env["LOCAL_RANK"] = str(local_rank) + + # spawn the processes + cmd = [] + if not args.no_python: + cmd = [sys.executable, "-u"] + if args.module: + cmd.append("-m") + else: + if args.module: + raise ValueError("Don't use both the '--no_python' flag" + " and the '--module' flag at the same time.") + cmd.append(args.training_script) + # A user may not want to pass local_rank as a keyword arg so we make this optional. + if not args.no_local_rank: + cmd.append(f"--local_rank={local_rank}") + cmd += args.training_script_args + + if args.enable_each_rank_log != "None": + log_file = os.path.join(args.enable_each_rank_log, + f"{log_name_prefix}_rank{dist_rank}.log") + log_fd = open(log_file, 'w') + process = subprocess.Popen(cmd, + env=current_env, + stdout=log_fd, + stderr=log_fd) + else: + process = subprocess.Popen(cmd, env=current_env) + + processes.append(process) + else: + from ..elasticity import DSElasticAgent + from torch.distributed.elastic.rendezvous import RendezvousParameters + from torch.distributed.elastic.agent.server.api import WorkerSpec + import torch.distributed.elastic.rendezvous.registry as rdzv_registry + from torch.distributed.elastic.multiprocessing import Std + + if args.min_elastic_nodes == -1: + args.min_elastic_nodes = 1 + if args.max_elastic_nodes == -1: + args.max_elastic_nodes = args.nnodes + assert args.max_elastic_nodes > 0 and args.min_elastic_nodes > 0 , "Max and Min nodes should be positive" + + current_env["NCCL_ASYNC_ERROR_HANDLING"] = str(1) + + # Get config and arguments cmd = [] if not args.no_python: cmd = [sys.executable, "-u"] @@ -162,13 +279,36 @@ def main(): raise ValueError("Don't use both the '--no_python' flag" " and the '--module' flag at the same time.") 
cmd.append(args.training_script) - # A user may not want to pass local_rank as a keyword arg so we make this optional. - if not args.no_local_rank: - cmd.append(f"--local_rank={local_rank}") cmd += args.training_script_args - - process = subprocess.Popen(cmd, env=current_env) - processes.append(process) + cmd_args = cmd[1:] + + rdzv_configs: Dict[str, str] = {'timeout': 100} + run_id = os.environ.get("ELASTIC_RUN_ID", ELASTIC_TRAINING_ID_DEFAULT) + + # Creating config for rendezvous class + rdzv_parameters = RendezvousParameters(backend='c10d', + endpoint=args.master_addr + ":" + + str(args.master_port), + run_id=run_id, + min_nodes=args.min_elastic_nodes, + max_nodes=args.max_elastic_nodes, + **rdzv_configs) + + spec = WorkerSpec( + role='trainer', + local_world_size=num_local_procs, + entrypoint=cmd[0], + args=cmd[1:], + rdzv_handler=rdzv_registry.get_rendezvous_handler(rdzv_parameters), + max_restarts=100, + monitor_interval=5, + redirects=Std.from_str("0"), + tee=Std.from_str("0"), + master_addr=None, + master_port=None, + ) + agent = DSElasticAgent(spec, current_env) + agent.run() sig_names = {2: "SIGINT", 15: "SIGTERM"} last_return_code = None @@ -177,7 +317,7 @@ def main(): for process in processes: logger.info(f"Killing subprocess {process.pid}") try: - process.kill() + terminate_process_tree(process.pid) except Exception: pass if last_return_code is not None: diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index a962a8a7c925955b05c99d997990f46b6fee7592..6027d1076e80ac5e9916008153a756857a94523b 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -1,11 +1,13 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import os import sys import shutil import subprocess import warnings -from shlex import quote +from shlex import split from abc import ABC, abstractmethod - +from deepspeed.accelerator import get_accelerator from ..utils import logger from .constants import 
PDSH_MAX_FAN_OUT, MVAPICH_TMP_HOSTFILE @@ -66,7 +68,14 @@ class PDSHRunner(MultiNodeRunner): # PDSH flags for max node fan out and specific hosts to launch on # See https://linux.die.net/man/1/pdsh for flag details - pdsh_cmd_args = ['pdsh', '-f', str(PDSH_MAX_FAN_OUT), '-w', active_workers] + pdsh_cmd_args = [ + 'pdsh', + '-S', + '-f', + str(PDSH_MAX_FAN_OUT), + '-w', + active_workers + ] + split(self.args.launcher_args) exports = "" for key, val in self.exports.items(): @@ -94,8 +103,16 @@ class PDSHRunner(MultiNodeRunner): deepspeed_launch.append("--no_local_rank") if self.args.save_pid: deepspeed_launch += ["--save_pid", f"{os.getpid()}"] + if self.args.elastic_training: + deepspeed_launch.append("--enable_elastic_training") + deepspeed_launch.append(f"--max_elastic_nodes={self.args.max_elastic_nodes}") + deepspeed_launch.append(f"--min_elastic_nodes={self.args.min_elastic_nodes}") + + cmd_to_search = [i + "\\" for i in deepspeed_launch[2:6]] + + kill_command = pdsh_cmd_args + ["pkill -f ", " ".join(cmd_to_search)[:-2]] return pdsh_cmd_args + deepspeed_launch + [self.user_script - ] + self.user_arguments + ] + self.user_arguments, kill_command class OpenMPIRunner(MultiNodeRunner): @@ -137,7 +154,7 @@ class OpenMPIRunner(MultiNodeRunner): '--mca', 'btl_tcp_if_include', 'eth0', - ] + ] + split(self.args.launcher_args) export_cmd = [] for k, v in self.exports.items(): @@ -153,6 +170,102 @@ class OpenMPIRunner(MultiNodeRunner): ] + self.user_arguments +class MPICHRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): + super().__init__(args, world_info_base64) + self.resource_pool = resource_pool + + def backend_exists(self): + #TODO: if IB is available we should suggestion mpich + return shutil.which('mpirun') #mpich_info + + @property + def name(self): + return "mpich" + + def validate_args(self): + super().validate_args() + #TODO: Allow for include/exclude at node-level but not gpu-level + if self.args.include != "" or 
self.args.exclude != "": + raise ValueError( + f"{self.name} backend does not support worker include/exclusion") + + if self.args.num_nodes != -1 or self.args.num_gpus != -1: + raise ValueError( + f"{self.name} backend does not support limiting num nodes/gpus") + + def get_cmd(self, environment, active_resources): + devices_per_node = self.resource_pool.values() + total_process_count = sum(devices_per_node) + process_per_node = list(devices_per_node)[0] + + mpirun_cmd = [ + 'mpirun', + '-n', + f'{total_process_count}', + '-ppn', + f'{process_per_node}', + ] + split(self.args.launcher_args) + export_cmd = [] + + for k, v in self.exports.items(): + export_cmd += ['-x', "{}={}".format(k, v)] + + python_exec = [] + if not self.args.no_python: + python_exec = [sys.executable, "-u"] + if self.args.module: + python_exec.append("-m") + return mpirun_cmd + python_exec + [self.user_script] + self.user_arguments + + +class SlurmRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): + super().__init__(args, world_info_base64) + self.resource_pool = resource_pool + + def backend_exists(self): + return shutil.which('sinfo') + + @property + def name(self): + return 'slurm' + + def get_cmd(self, environment, active_resources): + assert not getattr(self.args, 'detect_nvlink_pairs', False), "slurm backend does not support remapping visible devices" + total_process_count = sum(self.resource_pool.values()) + srun_cmd = [ + 'srun', + '-n', + f'{total_process_count}', + ] + split(self.args.launcher_args) + + if getattr(self.args, 'slurm_comment', ''): + srun_cmd += ['--comment', self.args.slurm_comment] + + if self.args.include != "": + srun_cmd.append('--include') + srun_cmd.append(f'{self.args.include}') + if self.args.exclude != "": + srun_cmd.append('--exclude') + srun_cmd.append(f'{self.args.exclude}') + if self.args.num_nodes > 0: + srun_cmd.append('--nodes') + srun_cmd.append(f'{self.args.num_nodes}') + if self.args.num_gpus > 0: + 
srun_cmd.append('--gpus') + srun_cmd.append(f'{self.args.num_gpus}') + + exports = '--export=ALL' + for key, val in self.exports.items(): + exports += f",{key}={val}" + + python_exec = [sys.executable, "-u"] + command = srun_cmd + [exports] + python_exec + [self.user_script + ] + self.user_arguments + return command + + class MVAPICHRunner(MultiNodeRunner): def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) @@ -165,7 +278,8 @@ class MVAPICHRunner(MultiNodeRunner): self.add_export('MV2_DEBUG_SHOW_BACKTRACE', '1') # Enabled cuda-aware communication - self.add_export('MV2_USE_CUDA', '1') + if get_accelerator().device_name() == 'cuda': + self.add_export('MV2_USE_CUDA', '1') # Support deep learning frameworks: http://hidl.cse.ohio-state.edu/userguide/horovod/ self.add_export('MV2_SUPPORT_DL', '1') @@ -227,7 +341,7 @@ class MVAPICHRunner(MultiNodeRunner): f'{process_per_node}', '--hostfile', f'{MVAPICH_TMP_HOSTFILE}', - ] + ] + split(self.args.launcher_args) export_cmd = [] for k, v in self.exports.items(): diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py old mode 100644 new mode 100755 index 551d24e0e1d1191c546b11413399fbe880b165be..044f6ff03365628be3fa8f774d33462057fa9824 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -7,6 +7,7 @@ per rank for training. 
""" import os +import re import sys import json import base64 @@ -14,18 +15,21 @@ import argparse import subprocess import collections from copy import deepcopy +import signal +import time -import torch.cuda - -from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner -from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER +from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner +from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER, MPICH_LAUNCHER from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT +from ..nebula.constants import NEBULA_EXPORT_ENVS from ..utils import logger from ..autotuning import Autotuner +from deepspeed.accelerator import get_accelerator DLTS_HOSTFILE = "/job/hostfile" -EXPORT_ENVS = ["NCCL", "PYTHON", "MV2", "UCX"] +EXPORT_ENVS = ['MLFLOW', 'NCCL', 'PYTHON', 'MV2', 'UCX'] +EXPORT_ENVS += NEBULA_EXPORT_ENVS DEEPSPEED_ENVIRONMENT_NAME = ".deepspeed_env" DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] PDSH_MAX_FAN_OUT = 1024 @@ -75,6 +79,18 @@ def parse_args(args=None): help="Total number of worker nodes to run on, this will use " "the top N hosts from the given hostfile.") + parser.add_argument("--min_elastic_nodes", + type=int, + default=-1, + help="Minimum number of nodes to run elastic training on. " + "Default is 1 when elastic training is enabled") + + parser.add_argument("--max_elastic_nodes", + type=int, + default=-1, + help="Maximum number of nodes to run elastic training on. " + "Default is num_nodes when elastic training is enabled") + parser.add_argument("--num_gpus", type=int, default=-1, @@ -93,11 +109,12 @@ def parse_args(args=None): help="(optional) IP address of node 0, will be " "inferred via 'hostname -I' if not specified.") - parser.add_argument("--launcher", - default=PDSH_LAUNCHER, - type=str, - help="(optional) choose launcher backend for multi-node " - "training. 
Options currently include PDSH, OpenMPI, MVAPICH.") + parser.add_argument( + "--launcher", + default=PDSH_LAUNCHER, + type=str, + help="(optional) choose launcher backend for multi-node " + "training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, MPICH.") parser.add_argument("--launcher_args", default="", @@ -121,6 +138,10 @@ def parse_args(args=None): help="Do not pass local_rank as an argument when calling " "the user's training script.") + parser.add_argument("--no_ssh_check", + action="store_true", + help="Do not perform ssh check in multi-node launcher model") + parser.add_argument("--force_multi", action="store_true", help="Force multi-node launcher mode, helps in cases where user " @@ -133,6 +154,12 @@ def parse_args(args=None): "where is the pid of the first process that invoked `deepspeed`. " "Useful when launching deepspeed processes programmatically.") + parser.add_argument( + "--enable_each_rank_log", + default="None", + type=str, + help="redirect the stdout and stderr from each rank into different log files") + parser.add_argument( "--autotuning", default="", @@ -142,6 +169,10 @@ def parse_args(args=None): help="Run DeepSpeed autotuner to discover optimal configuration parameters " "before running job.") + parser.add_argument("--elastic_training", + action="store_true", + help="Enable elastic training support in DeepSpeed.") + parser.add_argument("user_script", type=str, help="User script to launch, followed by any required " @@ -158,25 +189,45 @@ def fetch_hostfile(hostfile_path): # e.g., worker-0 slots=16 with open(hostfile_path, 'r') as fd: - resource_pool = collections.OrderedDict() - for line in fd.readlines(): - line = line.strip() - if line == '': - # skip empty lines - continue - try: - hostname, slots = line.split() - _, slot_count = slots.split("=") - slot_count = int(slot_count) - except ValueError as err: - logger.error("Hostfile is not formatted correctly, unable to " - "proceed with training.") - raise err - if hostname in 
resource_pool: - logger.error("Hostfile contains duplicate hosts, unable to " - "proceed with training.") - raise ValueError(f"host {hostname} is already defined") - resource_pool[hostname] = slot_count + hostfile_text = fd.readlines() + + return _parse_hostfile(hostfile_text) + + +def _parse_hostfile(hostfile_lines): + # Regex matches one or more non-whitespace characters (\S+) at the start of + # the line, followed by one or more whitespace characters (\s+), followed + # by the string "slots=", followed by one or more digits (\d+). + pattern = r'^(\S+)\s+slots=(\d+)' + + resource_pool = collections.OrderedDict() + + for line in hostfile_lines: + line = line.strip() + match = re.search(pattern, line) + if line.startswith("#") or line == "": + # hostfile comment or empty line, ignore + continue + elif match: + host = match.group(1) + num_slots = int(match.group(2)) + if host in resource_pool: + logger.error(f"Bad hostfile text: {hostfile_lines}") + raise ValueError( + f"Hostfile contains multiple entries for {host}, unable to proceed with launching" + ) + resource_pool[host] = num_slots + else: + logger.error(f"Bad hostfile text: {hostfile_lines}") + raise ValueError( + "Hostfile contains a bad entry: {line}, unable to proceed with launching" + ) + + if len(resource_pool) == 0: + logger.error(f"Bad hostfile text: {hostfile_lines}") + raise ValueError( + "Hostfile is empty or not formatted correctly, unable to proceed with launching." 
+ ) return resource_pool @@ -305,14 +356,33 @@ def run_autotuning(args, active_resources): tuner.print_tuning_results() logger.info("[End] Running autotuning") + tuner.write_optimal_config() if args.autotuning == "run": tuner.run_after_tuning() +def parse_num_nodes(str_num_nodes: str, elastic_training: bool): + node_list = str_num_nodes.split(":") + + if len(node_list) == 1: + min_nodes, max_nodes = int(node_list[0]), -1 + elif len(node_list) == 2 and elastic_training: + min_nodes, max_nodes = int(node_list[0]), int(node_list[1]) + elif len(node_list) == 2 and not elastic_training: + raise RuntimeError("MIN:MAX format is only supported in elastic training") + else: + raise RuntimeError("num_nodes {} is not in MIN:MAX format".format(str_num_nodes)) + + return min_nodes, max_nodes + + def main(args=None): args = parse_args(args) + if args.elastic_training: + assert args.master_addr != "", "Master Addr is required when elastic training is enabled" + resource_pool = fetch_hostfile(args.hostfile) # respect CUDA_VISIBLE_DEVICES for a single node and no explicit resource filters @@ -336,7 +406,7 @@ def main(args=None): multi_node_exec = True if not resource_pool: resource_pool = {} - device_count = torch.cuda.device_count() + device_count = get_accelerator().device_count() if device_count == 0: raise RuntimeError("Unable to proceed, no GPU resources available") resource_pool['localhost'] = device_count @@ -352,7 +422,7 @@ def main(args=None): env = os.environ.copy() # validate that passwordless-ssh is workly properly with this hostfile - if multi_node_exec: + if multi_node_exec and not args.no_ssh_check: first_host = list(active_resources.keys())[0] try: subprocess.check_call( @@ -369,8 +439,18 @@ def main(args=None): assert multi_node_exec first_host = list(active_resources.keys())[0] hostname_cmd = [f"ssh {first_host} hostname -I"] - result = subprocess.check_output(hostname_cmd, shell=True) + try: + result = subprocess.check_output(hostname_cmd, shell=True) + except 
subprocess.CalledProcessError as err: + logger.error( + "Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr" + ) + raise err args.master_addr = result.decode('utf-8').split()[0] + if not args.master_addr: + raise RuntimeError( + f"Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr" + ) logger.info(f"Using IP address of {args.master_addr} for node {first_host}") if args.autotuning != "": @@ -391,6 +471,9 @@ def main(args=None): updated_active_resources[hostname] = list(range(args.num_gpus)) active_resources = updated_active_resources + if args.elastic_training: + assert not args.no_local_rank, "--no_local_rank argument is not supported in Elastic training" + # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) @@ -414,6 +497,13 @@ def main(args=None): deepspeed_launch.append("--no_local_rank") if args.save_pid: deepspeed_launch += ["--save_pid", f"{os.getpid()}"] + if args.enable_each_rank_log: + deepspeed_launch.append( + f"--enable_each_rank_log={args.enable_each_rank_log}") + if args.elastic_training: + deepspeed_launch.append("--enable_elastic_training") + deepspeed_launch.append(f"--max_elastic_nodes={args.max_elastic_nodes}") + deepspeed_launch.append(f"--min_elastic_nodes={args.min_elastic_nodes}") cmd = deepspeed_launch + [args.user_script] + args.user_args else: args.launcher = args.launcher.lower() @@ -421,8 +511,12 @@ def main(args=None): runner = PDSHRunner(args, world_info_base64) elif args.launcher == OPENMPI_LAUNCHER: runner = OpenMPIRunner(args, world_info_base64, resource_pool) + elif args.launcher == MPICH_LAUNCHER: + runner = MPICHRunner(args, world_info_base64, resource_pool) elif args.launcher == MVAPICH_LAUNCHER: runner = MVAPICHRunner(args, world_info_base64, resource_pool) + elif args.launcher == SLURM_LAUNCHER: + runner = SlurmRunner(args, world_info_base64, 
resource_pool) else: raise NotImplementedError(f"Unknown launcher {args.launcher}") @@ -448,11 +542,26 @@ def main(args=None): key, val = var.split('=', maxsplit=1) runner.add_export(key, val) - cmd = runner.get_cmd(env, active_resources) + if args.launcher == PDSH_LAUNCHER: + cmd, kill_cmd = runner.get_cmd(env, active_resources) + else: + cmd = runner.get_cmd(env, active_resources) logger.info(f"cmd = {' '.join(cmd)}") result = subprocess.Popen(cmd, env=env) + def sigkill_handler(signum, frame): + result.send_signal(signal.SIGINT) + time.sleep(0.1) + result.send_signal(signal.SIGTERM) + result_kill = subprocess.Popen(kill_cmd, env=env) + result_kill.wait() + time.sleep(1) + sys.exit(1) + + if args.launcher == PDSH_LAUNCHER: + signal.signal(signal.SIGINT, sigkill_handler) + result.wait() # In case of failure must propagate the error-condition back to the caller (usually shell). The diff --git a/deepspeed/model_implementations/__init__.py b/deepspeed/model_implementations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..77d1c5b1e7cb7245c2bd7f55f2cc7548996e276e --- /dev/null +++ b/deepspeed/model_implementations/__init__.py @@ -0,0 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .transformers.ds_transformer import DeepSpeedTransformerInference +from .transformers.clip_encoder import DSClipEncoder diff --git a/deepspeed/model_implementations/diffusers/__init__.py b/deepspeed/model_implementations/diffusers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 --- /dev/null +++ b/deepspeed/model_implementations/diffusers/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/model_implementations/diffusers/unet.py b/deepspeed/model_implementations/diffusers/unet.py new file mode 100644 index 0000000000000000000000000000000000000000..8ac97945ceb2ffd0eb456172efb07c4b886af876 --- /dev/null +++ 
b/deepspeed/model_implementations/diffusers/unet.py @@ -0,0 +1,63 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +import torch +from ..features.cuda_graph import CUDAGraph + + +class DSUNet(CUDAGraph, torch.nn.Module): + def __init__(self, unet, enable_cuda_graph=True): + super().__init__(enable_cuda_graph=enable_cuda_graph) + self.unet = unet + # SD pipeline accesses this attribute + self.in_channels = unet.in_channels + self.device = self.unet.device + self.dtype = self.unet.dtype + self.config = self.unet.config + self.fwd_count = 0 + self.unet.requires_grad_(requires_grad=False) + self.unet.to(memory_format=torch.channels_last) + self.cuda_graph_created = False + + def _graph_replay(self, *inputs, **kwargs): + for i in range(len(inputs)): + if torch.is_tensor(inputs[i]): + self.static_inputs[i].copy_(inputs[i]) + for k in kwargs: + if torch.is_tensor(kwargs[k]): + self.static_kwargs[k].copy_(kwargs[k]) + self._cuda_graphs.replay() + return self.static_output + + def forward(self, *inputs, **kwargs): + if self.enable_cuda_graph: + if self.cuda_graph_created: + outputs = self._graph_replay(*inputs, **kwargs) + else: + self._create_cuda_graph(*inputs, **kwargs) + outputs = self._graph_replay(*inputs, **kwargs) + return outputs + else: + return self._forward(*inputs, **kwargs) + + def _create_cuda_graph(self, *inputs, **kwargs): + # warmup to create the workspace and cublas handle + cuda_stream = torch.cuda.Stream() + cuda_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(cuda_stream): + for i in range(3): + ret = self._forward(*inputs, **kwargs) + torch.cuda.current_stream().wait_stream(cuda_stream) + + # create cuda_graph and assign static_inputs and static_outputs + self._cuda_graphs = torch.cuda.CUDAGraph() + self.static_inputs = inputs + self.static_kwargs = kwargs + + with torch.cuda.graph(self._cuda_graphs): + self.static_output = self._forward(*self.static_inputs, **self.static_kwargs) + + self.cuda_graph_created = True + + 
def _forward(self, sample, timestamp, encoder_hidden_states, return_dict=True): + return self.unet(sample, timestamp, encoder_hidden_states, return_dict) diff --git a/deepspeed/model_implementations/diffusers/vae.py b/deepspeed/model_implementations/diffusers/vae.py new file mode 100644 index 0000000000000000000000000000000000000000..8f8d13cb2df2e73f1ab044755079225857fb5e52 --- /dev/null +++ b/deepspeed/model_implementations/diffusers/vae.py @@ -0,0 +1,148 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +import torch +from ..features.cuda_graph import CUDAGraph + + +class DSVAE(CUDAGraph, torch.nn.Module): + def __init__(self, vae, enable_cuda_graph=True): + super().__init__(enable_cuda_graph=enable_cuda_graph) + self.vae = vae + self.device = self.vae.device + self.dtype = self.vae.dtype + self.vae.requires_grad_(requires_grad=False) + self.decoder_cuda_graph_created = False + self.encoder_cuda_graph_created = False + self.all_cuda_graph_created = False + + def _graph_replay_decoder(self, *inputs, **kwargs): + for i in range(len(inputs)): + if torch.is_tensor(inputs[i]): + self.static_decoder_inputs[i].copy_(inputs[i]) + for k in kwargs: + if torch.is_tensor(kwargs[k]): + self.static_decoder_kwargs[k].copy_(kwargs[k]) + self._decoder_cuda_graph.replay() + return self.static_decoder_output + + def _decode(self, x, return_dict=True): + return self.vae.decode(x, return_dict=return_dict) + + def _create_cuda_graph_decoder(self, *inputs, **kwargs): + # warmup to create the workspace and cublas handle + cuda_stream = torch.cuda.Stream() + cuda_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(cuda_stream): + for i in range(3): + ret = self._decode(*inputs, **kwargs) + torch.cuda.current_stream().wait_stream(cuda_stream) + + # create cuda_graph and assign static_inputs and static_outputs + self._decoder_cuda_graph = torch.cuda.CUDAGraph() + self.static_decoder_inputs = inputs + self.static_decoder_kwargs = kwargs + + with 
torch.cuda.graph(self._decoder_cuda_graph): + self.static_decoder_output = self._decode(*self.static_decoder_inputs, + **self.static_decoder_kwargs) + + self.decoder_cuda_graph_created = True + + def decode(self, *inputs, **kwargs): + if self.enable_cuda_graph: + if self.decoder_cuda_graph_created: + outputs = self._graph_replay_decoder(*inputs, **kwargs) + else: + self._create_cuda_graph_decoder(*inputs, **kwargs) + outputs = self._graph_replay_decoder(*inputs, **kwargs) + return outputs + else: + return self._decode(*inputs, **kwargs) + + def _graph_replay_encoder(self, *inputs, **kwargs): + for i in range(len(inputs)): + if torch.is_tensor(inputs[i]): + self.static_encoder_inputs[i].copy_(inputs[i]) + for k in kwargs: + if torch.is_tensor(kwargs[k]): + self.static_encoder_kwargs[k].copy_(kwargs[k]) + self._encoder_cuda_graph.replay() + return self.static_encoder_output + + def _encode(self, x, return_dict=True): + return self.vae.encode(x, return_dict=return_dict) + + def _create_cuda_graph_encoder(self, *inputs, **kwargs): + # warmup to create the workspace and cublas handle + cuda_stream = torch.cuda.Stream() + cuda_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(cuda_stream): + for i in range(3): + ret = self._encode(*inputs, **kwargs) + torch.cuda.current_stream().wait_stream(cuda_stream) + + # create cuda_graph and assign static_inputs and static_outputs + self._encoder_cuda_graph = torch.cuda.CUDAGraph() + self.static_encoder_inputs = inputs + self.static_encoder_kwargs = kwargs + + with torch.cuda.graph(self._encoder_cuda_graph): + self.static_encoder_output = self._encode(*self.static_encoder_inputs, + **self.static_encoder_kwargs) + + self.encoder_cuda_graph_created = True + + def encode(self, *inputs, **kwargs): + if self.enable_cuda_graph: + if self.encoder_cuda_graph_created: + outputs = self._graph_replay_encoder(*inputs, **kwargs) + else: + self._create_cuda_graph_encoder(*inputs, **kwargs) + outputs = 
self._graph_replay_encoder(*inputs, **kwargs) + return outputs + else: + return self._encode(*inputs, **kwargs) + + def _graph_replay(self, *inputs, **kwargs): + for i in range(len(inputs)): + if torch.is_tensor(inputs[i]): + self.static_inputs[i].copy_(inputs[i]) + for k in kwargs: + if torch.is_tensor(kwargs[k]): + self.static_kwargs[k].copy_(kwargs[k]) + self._all_cuda_graph.replay() + return self.static_output + + def forward(self, *inputs, **kwargs): + if self.enable_cuda_graph: + if self.cuda_graph_created: + outputs = self._graph_replay(*inputs, **kwargs) + else: + self._create_cuda_graph(*inputs, **kwargs) + outputs = self._graph_replay(*inputs, **kwargs) + return outputs + else: + return self._forward(*inputs, **kwargs) + + def _create_cuda_graph(self, *inputs, **kwargs): + # warmup to create the workspace and cublas handle + cuda_stream = torch.cuda.Stream() + cuda_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(cuda_stream): + for i in range(3): + ret = self._forward(*inputs, **kwargs) + torch.cuda.current_stream().wait_stream(cuda_stream) + + # create cuda_graph and assign static_inputs and static_outputs + self._all_cuda_graph = torch.cuda.CUDAGraph() + self.static_inputs = inputs + self.static_kwargs = kwargs + + with torch.cuda.graph(self._all_cuda_graph): + self.static_output = self._forward(*self.static_inputs, **self.static_kwargs) + + self.all_cuda_graph_created = True + + def _forward(self, sample, timestamp, encoder_hidden_states, return_dict=True): + return self.vae(sample, timestamp, encoder_hidden_states, return_dict) diff --git a/deepspeed/model_implementations/features/__init__.py b/deepspeed/model_implementations/features/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 --- /dev/null +++ b/deepspeed/model_implementations/features/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git 
class DSClipEncoder(CUDAGraph, torch.nn.Module):
    """CUDA-graph-capable wrapper around a HuggingFace CLIP text encoder.

    Captures the wrapped encoder's forward pass into a ``torch.cuda.CUDAGraph``
    and replays it on subsequent calls, avoiding per-call kernel-launch
    overhead. When ``enable_cuda_graph`` is False the wrapper is a transparent
    pass-through to ``enc``.

    Args:
        enc: the CLIP text encoder module to wrap (must expose ``.text_model``,
            ``.device``, ``.dtype`` and ``.config``).
        enable_cuda_graph: capture/replay forward passes via CUDA graphs.
    """
    def __init__(self, enc, enable_cuda_graph=False):
        super().__init__(enable_cuda_graph=enable_cuda_graph)
        # Patch the wrapped encoder so its causal mask is built directly on the
        # accelerator device (see _build_causal_attention_mask below).
        enc.text_model._build_causal_attention_mask = self._build_causal_attention_mask
        self.enc = enc
        self.device = self.enc.device
        self.dtype = self.enc.dtype
        # Two independent graph slots, alternated via self.iter on each
        # graph-enabled forward. NOTE(review): presumably one slot per
        # text-encoder pass of the pipeline (e.g. conditional/unconditional
        # prompts) -- confirm against callers.
        self.cuda_graph_created = [False, False]
        self.static_inputs = [None, None]
        self.static_kwargs = [None, None]
        self.static_output = [None, None]
        self._cuda_graphs = [None, None]
        self.iter = 0
        self.config = self.enc.config

    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
        """Build an additive causal attention mask of shape (bsz, 1, seq_len, seq_len).

        After fill_ + triu_(1), strictly upper-triangular entries hold
        ``finfo(dtype).min`` (blocking attention to future tokens) while the
        diagonal and below are zero.
        """
        mask = torch.empty(bsz,
                           seq_len,
                           seq_len,
                           dtype=dtype,
                           device=get_accelerator().current_device_name())
        mask.fill_(torch.tensor(torch.finfo(dtype).min))
        mask.triu_(1)
        mask = mask.unsqueeze(1)
        return mask

    def _graph_replay(self, *inputs, **kwargs):
        """Copy the live inputs into the captured graph's static tensors and replay.

        Only tensor arguments are copied; non-tensor arguments are assumed to be
        identical to those used at capture time.
        """
        for i in range(len(inputs)):
            if torch.is_tensor(inputs[i]):
                self.static_inputs[self.iter][i].copy_(inputs[i])
        for k in kwargs:
            if torch.is_tensor(kwargs[k]):
                self.static_kwargs[self.iter][k].copy_(kwargs[k])
        self._cuda_graphs[self.iter].replay()
        return self.static_output[self.iter]

    def forward(self, *inputs, **kwargs):
        """Run the encoder, capturing a CUDA graph on first use of each slot."""
        if self.enable_cuda_graph:
            if self.cuda_graph_created[self.iter]:
                outputs = self._graph_replay(*inputs, **kwargs)
            else:
                # First call for this slot: capture, then immediately replay so
                # the returned outputs come from the graph's static buffers.
                self._create_cuda_graph(*inputs, **kwargs)
                outputs = self._graph_replay(*inputs, **kwargs)
            # Alternate between the two graph slots on every call.
            self.iter = (self.iter + 1) % 2
            return outputs
        else:
            return self.enc(*inputs, **kwargs)

    def _create_cuda_graph(self, *inputs, **kwargs):
        # warmup to create the workspace and cublas handle
        # (run 3 eager iterations on a side stream before capture)
        cuda_stream = torch.cuda.Stream()
        cuda_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(cuda_stream):
            for i in range(3):
                ret = self._forward(*inputs, **kwargs)
        torch.cuda.current_stream().wait_stream(cuda_stream)

        # create cuda_graph and assign static_inputs and static_outputs
        # The capture-time inputs become the slot's static buffers; replays
        # copy new data into them (see _graph_replay).
        self._cuda_graphs[self.iter] = torch.cuda.CUDAGraph()
        self.static_inputs[self.iter] = inputs
        self.static_kwargs[self.iter] = kwargs

        with torch.cuda.graph(self._cuda_graphs[self.iter]):
            self.static_output[self.iter] = self._forward(
                *self.static_inputs[self.iter],
                **self.static_kwargs[self.iter])

        self.cuda_graph_created[self.iter] = True

    def _forward(self, *inputs, **kwargs):
        """Uncaptured (eager) forward through the wrapped encoder."""
        return self.enc(*inputs, **kwargs)
'''Copyright The Microsoft DeepSpeed Team'''

import torch.nn as nn


class DeepSpeedTransformerBase(nn.Module):
    """Placeholder base class for DeepSpeed transformer implementations.

    This is intended to become the clean base class that will replace
    DeepSpeedTransformerInference; its final shape is not yet known, so it is
    kept here as a placeholder.
    """
    def __init__(self):
        # BUGFIX: the original subclassed `nn.module` (lowercase), which raises
        # AttributeError the moment this module is imported, and its __init__
        # was `pass`, skipping nn.Module's own initialization. Subclass
        # nn.Module and call super().__init__() so parameter/buffer/hook
        # machinery is set up for future subclasses.
        super().__init__()
+ """ + def __init__(self, + config, + mp_group=None, + quantize_scales=None, + quantize_groups=1, + merge_count=1, + mlp_extra_grouping=False): + super().__init__(config, + mp_group, + quantize_scales, + quantize_groups, + merge_count, + mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_bloom.py b/deepspeed/model_implementations/transformers/ds_bloom.py new file mode 100644 index 0000000000000000000000000000000000000000..386352f28a76de5020b7c1b111dae8e998ae1094 --- /dev/null +++ b/deepspeed/model_implementations/transformers/ds_bloom.py @@ -0,0 +1,23 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference + + +class DeepSpeedBloomInference(DeepSpeedTransformerInference): + """Initialize the DeepSpeed Bloom Transformer Layer. + """ + def __init__(self, + config, + mp_group=None, + quantize_scales=None, + quantize_groups=1, + merge_count=1, + mlp_extra_grouping=False): + super().__init__(config, + mp_group, + quantize_scales, + quantize_groups, + merge_count, + mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_gpt.py b/deepspeed/model_implementations/transformers/ds_gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..86cc9fdc6715f776551888a6b94cf918b1681406 --- /dev/null +++ b/deepspeed/model_implementations/transformers/ds_gpt.py @@ -0,0 +1,23 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference + + +class DeepSpeedGPTInference(DeepSpeedTransformerInference): + """Initialize the DeepSpeed GPT Transformer Layer. 
+ """ + def __init__(self, + config, + mp_group=None, + quantize_scales=None, + quantize_groups=1, + merge_count=1, + mlp_extra_grouping=False): + super().__init__(config, + mp_group, + quantize_scales, + quantize_groups, + merge_count, + mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_megatron_gpt.py b/deepspeed/model_implementations/transformers/ds_megatron_gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..aca6b809e5df12f03e2b3a83568611ed277a331d --- /dev/null +++ b/deepspeed/model_implementations/transformers/ds_megatron_gpt.py @@ -0,0 +1,23 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference + + +class DeepSpeedMegatronGPTInference(DeepSpeedTransformerInference): + """Initialize the DeepSpeed Megatron GPT Transformer Layer. + """ + def __init__(self, + config, + mp_group=None, + quantize_scales=None, + quantize_groups=1, + merge_count=1, + mlp_extra_grouping=False): + super().__init__(config, + mp_group, + quantize_scales, + quantize_groups, + merge_count, + mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_opt.py b/deepspeed/model_implementations/transformers/ds_opt.py new file mode 100644 index 0000000000000000000000000000000000000000..a5209a30f818bfab912568872070bba046b96d09 --- /dev/null +++ b/deepspeed/model_implementations/transformers/ds_opt.py @@ -0,0 +1,23 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference + + +class DeepSpeedOPTInference(DeepSpeedTransformerInference): + """Initialize the DeepSpeed OPT Transformer Layer. 
+ """ + def __init__(self, + config, + mp_group=None, + quantize_scales=None, + quantize_groups=1, + merge_count=1, + mlp_extra_grouping=False): + super().__init__(config, + mp_group, + quantize_scales, + quantize_groups, + merge_count, + mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_transformer.py b/deepspeed/model_implementations/transformers/ds_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ee5a9bdf8763a743b8f233c0c48e7e6befade499 --- /dev/null +++ b/deepspeed/model_implementations/transformers/ds_transformer.py @@ -0,0 +1,188 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +import torch +import torch.nn as nn +from deepspeed import comm as dist +from deepspeed.utils.logging import log_dist + +from deepspeed.ops.transformer.inference.ds_mlp import DeepSpeedMLP +from deepspeed.ops.transformer.inference.ds_attention import DeepSpeedSelfAttention, BloomSelfAttention +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder + +inference_cuda_module = None + + +class DeepSpeedTransformerInference(nn.Module): + """Initialize the DeepSpeed Transformer Layer. + Arguments: + layer_id: The layer index starting from 0, e.g. if model has 24 transformer layers, + layer_id will be 0,1,2...23 when each layer object is instantiated + config: An object of DeepSpeedInferenceConfig + mp_group: Model parallelism group initialized on the modeling side. + quantize_scales: This argument groups all the layers' scales used for quantization + quantize_groups: Number of groups used for quantizing the model + merge_count: Shows the number of model-parallel checkpoints merged before running inference. + We use this argument to control the quantization scale for the model parameters if a bigger + quantize-grouping than 1 is used. + mlp_extra_grouping: This flag is used to show a 2x higher number of groups used for the MLP part + of a Transformer layer. 
We use this feature for quantization to reduce the convergence impact + for specific downstream tasks. + """ + layer_id = 0 + + def __init__(self, + config, + mp_group=None, + quantize_scales=None, + quantize_groups=1, + merge_count=1, + mlp_extra_grouping=False): + super(DeepSpeedTransformerInference, self).__init__() + + self.config = config + self.config.layer_id = DeepSpeedTransformerInference.layer_id + DeepSpeedTransformerInference.layer_id += 1 + + data_type = torch.half if config.fp16 else torch.float + global inference_cuda_module + if inference_cuda_module is None: + builder = InferenceBuilder() + inference_cuda_module = builder.load() + + if DeepSpeedTransformerInference.layer_id == 1: + log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0]) + + if self.config.bigscience_bloom: + self.attention = BloomSelfAttention(self.config, + mp_group, + quantize_scales, + quantize_groups, + merge_count) + else: + self.attention = DeepSpeedSelfAttention(self.config, + mp_group, + quantize_scales, + quantize_groups, + merge_count) + self.mlp = DeepSpeedMLP(self.config, + mp_group, + quantize_scales, + quantize_groups, + merge_count, + mlp_extra_grouping) + + device = get_accelerator().current_device_name( + ) # if config.bigscience_bloom else 'cpu' + self.norm_w = nn.Parameter(torch.empty(self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + self.layer_past = None + self.allocate_workspace = inference_cuda_module.allocate_workspace_fp32 if (not config.fp16) else \ + inference_cuda_module.allocate_workspace_fp16 + + @classmethod + def reset_cache(cls): + if inference_cuda_module is not None: + inference_cuda_module.reset_cache() + + def forward( + self, + input=None, + input_mask=None, + attention_mask=None, + attn_mask=None, + head_mask=None, + layer_past=None, + get_key_value=False, + 
get_present=False, + encoder_output=None, + enc_dec_attn_mask=None, + x=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=False, + alibi=None, + output_attentions=False, + # TODO(arashb): 'layer_head_mask' and 'past_key_value' are only added to satisfy the OPT models API. + # This needs to be redesigned later! + layer_head_mask=None, + past_key_value=None): + + if x is not None: + input = x + + input_mask = (input_mask if attn_mask is None else + attn_mask) if attention_mask is None else attention_mask + + # Allocate memory only on first layer forward + if self.config.layer_id == 0: + self.allocate_workspace(self.config.hidden_size, + self.config.heads, + input.size()[1], + input.size()[0], + DeepSpeedTransformerInference.layer_id, + self.config.mp_size, + self.config.bigscience_bloom, + dist.get_rank() if dist.is_initialized() else 0, + self.config.max_out_tokens) + + get_present = (get_present or get_key_value or use_cache) + input_mask = input_mask if attention_mask is None else attention_mask + + # We set the prev key/value to None when there is a prompt + if input.shape[1] > 1: + self.layer_past = None + layer_past = layer_past if layer_past is not None else self.layer_past + head_mask = layer_head_mask if layer_head_mask is not None else head_mask + + attn_mask = None + if isinstance(input, tuple): + attn_mask = input[1] + input = input[0] + input_type = input.dtype + + if (self.config.fp16 or self.config.q_int8) \ + and input.dtype == torch.float: + input = input.half() + with torch.no_grad(): + attention_output, key, value, context_outputtn_ctx, inp_norm = \ + self.attention(input, + input_mask, + head_mask, + layer_past, + get_present, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + self.norm_w, + self.norm_b, + alibi) + + presents = (key, value) + self.layer_past = presents if layer_past is None else None + output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) + + if not 
self.config.pre_layer_norm: + output = inference_cuda_module.layer_norm(output, + self.norm_w, + self.norm_b, + self.config.epsilon) + + output = output.to(input_type) + if get_present: + output = (output, presents) + + if self.config.return_single_tuple: + return (output, ) + elif self.config.return_tuple: + return output if type(output) is tuple else (output, attn_mask) + else: + return output diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py old mode 100644 new mode 100755 index 7fe1a3b36b2ed4b286b0a2c351b759cf5130a537..aab302864051d55db567dfd5868dbe8874d841ee --- a/deepspeed/module_inject/__init__.py +++ b/deepspeed/module_inject/__init__.py @@ -1,3 +1,7 @@ -from .replace_module import replace_transformer_layer, revert_transformer_layer +'''Copyright The Microsoft DeepSpeed Team''' + +from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing, GroupQuantizer, generic_injection from .module_quantize import quantize_transformer_layer -from .replace_policy import DSPolicy, HFBertLayerPolicy +from .replace_policy import HFBertLayerPolicy +from .layers import LinearAllreduce, LinearLayer, EmbeddingLayer, Normalize +from .policy import DSPolicy diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py new file mode 100644 index 0000000000000000000000000000000000000000..a2c570f5781c724bca1a92a8c17a3744aad7f718 --- /dev/null +++ b/deepspeed/module_inject/auto_tp.py @@ -0,0 +1,124 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +# Automatic Tensor Parallelism +import re + +from torch import nn +from .replace_policy import replace_policies + + +class AutoTP(): + def in_module_list(module, module_list): + for item in module_list: + if type(item).__name__ == type(module).__name__: + return True + return False + + def get_module_list(model): + mlist = [] + for child in model.children(): + if isinstance(child, nn.ModuleList): + for module in child.children(): + 
class AutoTP():
    """Heuristics for automatic tensor parallelism (AutoTP).

    Inspects a HuggingFace-style model and derives a tensor-parallel
    partitioning policy (which Linear layers to shard) without a hand-written
    injection policy. All methods are stateless and operate on the model
    passed in.

    Fix/improvement over the original: the methods were written without
    ``self`` and without ``@staticmethod``, so they only worked when called on
    the class object and would break if ever called on an instance. They are
    now proper static methods; logic is unchanged.
    """

    @staticmethod
    def in_module_list(module, module_list):
        """Return True if module_list already contains a module of the same
        class name as `module` (comparison is by type name, not identity)."""
        for item in module_list:
            if type(item).__name__ == type(module).__name__:
                return True
        return False

    @staticmethod
    def get_module_list(model):
        """Collect one representative module per distinct layer class found
        inside the model's nn.ModuleList children (recursing elsewhere)."""
        mlist = []
        for child in model.children():
            if isinstance(child, nn.ModuleList):
                for module in child.children():
                    if not mlist:
                        mlist = [module]
                    elif not AutoTP.in_module_list(module, mlist):
                        mlist = mlist + [module]
            else:
                mlist = mlist + AutoTP.get_module_list(child)
        return mlist

    @staticmethod
    def supported(model):
        """Return False if the model's repr names an architecture known not to
        work with AutoTP; raise AssertionError if the architecture cannot be
        determined at all."""
        unsupported = [
            'bloom',
            'codegen',
            'deberta',
            'flaubert',
            'fsmt',
            'gpt2',
            'led',
            'longformer',
            'xlm',
            'xlnet'
        ]
        model = str(model)
        # Try to extract the architecture name from the repr, in order of
        # decreasing specificity: ": XModel", ": XStack", leading "XModel".
        key = re.search(r": (.*?)Model", model)
        if key is None:
            key = re.search(r": (.*?)Stack", model)
        if key is None:
            key = re.match(r"(.*?)Model", model)
        assert key is not None, "Not able to determine model policy automatically. Please provide policy."
        if key.group(1).lower() in unsupported:
            return False
        return True

    @staticmethod
    def get_layers(parent, module):
        """Recursively list dotted names of Linear submodules under `module`;
        LayerNorm-like entries are recorded as the sentinel "ln"."""
        layer_list = []
        for key, submodule in module._modules.items():
            if isinstance(submodule, nn.Linear):
                layer_list = layer_list + [parent + "." + key]
            elif isinstance(submodule,
                            nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm':
                layer_list = layer_list + ["ln"]
            else:
                layer_list = layer_list + AutoTP.get_layers(key, submodule)
        return layer_list

    @staticmethod
    def update_policy_list(policy_list, new_module, new_gems):
        """Append (type(new_module), new_gems) to policy_list, or merge the
        gems (deduplicated into a set) if that module type is already listed."""
        if len(policy_list):
            for i, policy in enumerate(policy_list):
                # if module already exists in policy, combine gems and remove duplicates
                if policy[0] == type(new_module):
                    new_gems = set(new_gems + policy[1])
                    policy_list[i] = tuple([type(new_module), new_gems])
                    return policy_list
        policy_list.append(tuple([type(new_module), new_gems]))
        return policy_list

    @staticmethod
    def kernel_supported(module_list):
        """Return True if any module in module_list matches a layer class that
        has a kernel-injection replace policy registered."""
        policy = []
        for plcy in replace_policies:
            # instantiate a throw-away policy in order to populate the _orig_layer_class
            _ = plcy(None)
            if isinstance(plcy._orig_layer_class, list):
                for orig_layer_class in plcy._orig_layer_class:
                    policy.append(orig_layer_class)
            elif plcy._orig_layer_class is not None:
                policy.append(plcy._orig_layer_class)
        for child in module_list:
            if child.__class__ in policy:
                return True
        return False

    @staticmethod
    def tp_parser(model):
        """Build the AutoTP policy list for `model`.

        For each distinct layer class, records the Linear layers that must be
        all-reduced after sharding: the Linear immediately preceding each
        LayerNorm, plus any 'out_proj' projection. Raises AssertionError with
        a targeted message when AutoTP cannot be applied.
        """
        policy_list = []
        module_list = []
        layer_list = []
        gem_list = []

        module_list = AutoTP.get_module_list(model)
        assert AutoTP.supported(model), "AutoTP not supported for model. Please use kernel injection since container policy for model exists." \
            if AutoTP.kernel_supported(module_list) else "AutoTP not supported for model. Please provide policy."
        for module in module_list:
            for key, submodule in module._modules.items():
                if isinstance(submodule, nn.Linear):
                    layer_list = layer_list + ["." + key]
                elif isinstance(
                        submodule,
                        nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm':
                    layer_list = layer_list + ["ln"]
                else:
                    layer_list = layer_list + AutoTP.get_layers(key, submodule)
            for i, layer in enumerate(layer_list):
                if layer == 'ln':
                    if layer_list[i - 1] != 'ln':
                        gem_list = gem_list + [layer_list[i - 1]]
                elif 'out_proj' in layer:
                    gem_list = gem_list + [layer]
            layer_list = []
            if gem_list != []:
                gem_list = list(set(gem_list))
                policy_list = AutoTP.update_policy_list(policy_list, module, gem_list)
                gem_list = []
        assert len(policy_list), "AutoTP not supported for model. Please use kernel injection since container policy for model exists." \
            if AutoTP.kernel_supported(module_list) else "Not able to determine model policy automatically. Please provide policy."
        return policy_list
from abc import ABC
import torch

from deepspeed.ops.transformer.inference.config import DeepSpeedInferenceConfig
from deepspeed.accelerator import get_accelerator


class BaseConvolutionContainer(ABC):
    # not implemented
    def __init__(self):
        pass


class BaseTransformerContainer(ABC):
    """Base container holding one transformer layer's tensors and config.

    A container mediates between a model-specific policy (which knows where
    the original HF module keeps its weights) and the DeepSpeed inference
    module that replaces it: it copies tensors out of the policy, optionally
    transposes/quantizes/shards them, and writes them into the new module.
    """
    def __init__(self, policy, config, model_config, layer_id, child):
        self.policy = policy
        self.config = config            # DeepSpeed inference config
        self.model_config = model_config  # original HF model config
        self.layer_id = layer_id
        self.child = child              # the original (client) layer module

        self.megatron_v2 = self.policy.is_megatron_v2
        self.scale_attention = self.policy.scale_attention
        self.ckpt_load_enabled = False

        # configuration for models. todo: can this be moved to a pydantic model config?
        self.hidden_size = None
        self.num_attention_heads = None
        self.mp_size = self.config.tensor_parallel.tp_size
        self.pre_layer_norm = self.policy.pre_attn_norm
        self.fp16 = False
        self.attn_linear_layer = self.policy.linear_layer
        self.mlp_linear_layer = self.policy.linear_layer
        # HF configs name the layer-norm epsilon inconsistently; probe the
        # known spellings and fall back to 1e-12.
        self.layer_norm_eps = self.model_config.layer_norm_eps if \
            hasattr(self.model_config, 'layer_norm_eps') else (self.model_config.layer_norm_epsilon if \
            hasattr(self.model_config, 'layer_norm_epsilon') else self.model_config.layernorm_epsilon if \
            hasattr(self.model_config, 'layernorm_epsilon') else 1.0e-12)
        self.return_tuple = self.config.return_tuple
        self.triangular_masking = True
        # GPT-Neo-style configs mark some layers as windowed ("local") attention.
        self.local_attention = ((self.model_config.attention_layers[self.layer_id]
                                 == "local") if hasattr(self.model_config,
                                                        'attention_layers') else False)
        self.window_size = getattr(self.model_config, "window_size", 1)
        self.mlp_act_func_type = self.policy.mlp_act_func_type
        self.training_mp_size = self.config.training_mp_size
        self.bigscience_bloom = False
        self.max_out_tokens = self.config.max_out_tokens
        self.scale_attn_by_inverse_layer_idx = getattr(
            self.config,
            "scale_attn_by_inverse_layer_idx",
            False)
        self.use_mup = self.policy.use_mup
        self.return_single_tuple = False
        # Rotary embedding dim: from model config, else from the child layer's
        # attention module; -1 means "no rotary embedding".
        self.rotary_dim = self.model_config.rotary_dim if hasattr(self.model_config, 'rotary_dim') \
                        else self.child.attention.rotary_ndims if \
                        hasattr(self.child, 'attention') and hasattr(self.child.attention,'rotary_ndims') else -1
        self.mlp_after_attn = (self.rotary_dim is None or self.rotary_dim < 0)

        # Attention tensors
        self.qkvw = None
        self.qkvb = None
        self.dense_w = None
        self.dense_b = None
        # MLP tensors
        self._h4h_w = None
        self._h4h_b = None
        self._4hh_w = None
        self._4hh_b = None
        # LayerNorm tensors
        self.attn_nw = None
        self.attn_nb = None
        self.input_nw = None
        self.input_nb = None

    def create_ds_model_config(self):
        """Assemble the DeepSpeedInferenceConfig for the replacement module."""
        self.set_hidden_heads(*self.policy.get_hidden_heads())
        assert self.num_attention_heads % self.mp_size == 0,\
                "To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\
                "This is because the attention computation is partitioned evenly among the parallel GPUs."

        self.ds_model_config = DeepSpeedInferenceConfig(
            hidden_size=self.hidden_size,
            heads=self.num_attention_heads,
            layer_norm_eps=self.layer_norm_eps,
            fp16=self.fp16,
            pre_layer_norm=self.pre_layer_norm,
            mp_size=self.mp_size,
            q_int8=self.quantize,
            return_tuple=self.return_tuple,
            triangular_masking=self.triangular_masking,
            local_attention=self.local_attention,
            window_size=self.window_size,
            rotary_dim=self.rotary_dim,
            mlp_after_attn=self.mlp_after_attn,
            mlp_act_func_type=self.mlp_act_func_type,
            training_mp_size=self.training_mp_size,
            bigscience_bloom=self.bigscience_bloom,
            max_out_tokens=self.max_out_tokens,
            scale_attn_by_inverse_layer_idx=self.scale_attn_by_inverse_layer_idx,
            use_mup=self.use_mup,
            return_single_tuple=self.return_single_tuple,
        )

        return self.ds_model_config

    def initialize_tensors(self):
        # Set the tensors from policy (user module) to container (DS module)
        self.set_attention(*self.policy.attention())
        self.set_mlp(*self.policy.mlp())
        self.set_layernorm(*self.policy.layernorm())

    def convert_to_required_dtype(self, dtype):
        # Note: converting tensors to fp16 requires that we do it in-place using self.__dict__ and not make a list/dict copy
        if dtype == torch.half:
            for k, v in self.__dict__.items():
                # The list comprehension is used for MoE tensor lists
                if isinstance(v, list) and all((isinstance(tensor, torch.Tensor) \
                    or isinstance(tensor, torch.nn.Parameter)) for tensor in v):
                    self.__dict__[k] = [moe_tensor.half() for moe_tensor in v]

                if isinstance(v, torch.Tensor) or isinstance(v, torch.nn.Parameter):
                    self.__dict__[k] = v.half()

    def set_dtype(self, fp16=False):
        self.fp16 = fp16

    def set_moe(self, moe=False):
        self.moe = moe

    def set_tensor_parallel_config(self, mp_size, mp_group):
        self.mp_size = mp_size
        self.mp_group = mp_group

    def set_quantization_config(self, quantize, quantizer):
        self.quantize = quantize
        self.quantizer = quantizer

    def set_hidden_heads(self, hidden_size, num_attention_heads):
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads

    def set_attention(self, qkvw, qkvb, dense_w, dense_b):
        self.qkvw = qkvw
        self.qkvb = qkvb
        self.dense_w = dense_w
        self.dense_b = dense_b

    def set_mlp(self, _h4h_w, _h4h_b, _4hh_w, _4hh_b):
        self._h4h_w = _h4h_w
        self._h4h_b = _h4h_b
        self._4hh_w = _4hh_w
        self._4hh_b = _4hh_b

    def set_layernorm(self, attn_nw, attn_nb, input_nw, input_nb):
        self.attn_nw = attn_nw
        self.attn_nb = attn_nb
        self.input_nw = input_nw
        self.input_nb = input_nb

    def apply_weight_quantization(self):
        # quantize attention weights
        self.attention_quantization()

        # quantize mlp weights
        self.mlp_quantization()

    def attention_quantization(self):
        self.module.attention.attn_qkvw = self.quantizer.quantize(
            self.module.attention.attn_qkvw)
        self.module.attention.attn_ow = self.quantizer.quantize(
            self.module.attention.attn_ow)

    def mlp_quantization(self):
        self.module.mlp.inter_w = self.quantizer.quantize(self.module.mlp.inter_w)
        self.module.mlp.output_w = self.quantizer.quantize(self.module.mlp.output_w)

    def apply_tensor_parallelism(self, mp_replace):
        # setup the new Attention module
        self.attention_qkv_mp(mp_replace)
        self.attention_o_mp(mp_replace)

        # setup the new MLP module
        self.mlp_inter_mp(mp_replace)
        self.mlp_output_mp(mp_replace)

        # Apply weight quantization
        self.apply_weight_quantization()

    def attention_qkv_mp(self, mp_replace):
        self.module.attention.attn_qkvw = mp_replace.qkv_copy(
            self.module.attention.attn_qkvw,
            self.qkvw)
        self.module.attention.attn_qkvb = mp_replace.qkv_copy(
            self.module.attention.attn_qkvb,
            self.qkvb)

    def attention_o_mp(self, mp_replace):
        self.module.attention.attn_ow = mp_replace.copy(self.module.attention.attn_ow,
                                                        self.dense_w)
        self.module.attention.attn_ob = mp_replace.copy(self.module.attention.attn_ob,
                                                        self.dense_b)

    def mlp_inter_mp(self, mp_replace):
        self.module.mlp.inter_w = mp_replace.copy(self.module.mlp.inter_w, self._h4h_w)
        self.module.mlp.inter_b = mp_replace.copy(self.module.mlp.inter_b, self._h4h_b)

    def mlp_output_mp(self, mp_replace):
        self.module.mlp.output_w = mp_replace.copy(self.module.mlp.output_w, self._4hh_w)
        self.module.mlp.output_b = mp_replace.copy(self.module.mlp.output_b, self._4hh_b)

    def copy_data_to_new_module(self):
        # NOTE(review): when the policy supplies no attention layer-norm
        # (attn_nw is None), None is deliberately propagated to the module --
        # confirm downstream kernels accept a None attn_nw/attn_nb.
        if self.attn_nw is None:
            self.module.mlp.attn_nw = self.attn_nw
            self.module.mlp.attn_nb = self.attn_nb
        else:
            self.module.mlp.attn_nw.data.copy_(
                self.attn_nw.to(get_accelerator().current_device_name()))
            self.module.mlp.attn_nb.data.copy_(
                self.attn_nb.to(get_accelerator().current_device_name()))

        self.module.norm_w.data.copy_(
            self.input_nw.to(get_accelerator().current_device_name()))
        self.module.norm_b.data.copy_(
            self.input_nb.to(get_accelerator().current_device_name()))

    def transpose(self):
        self.transpose_attention()
        self.transpose_mlp()

    def transpose_attention(self):
        if self.attn_linear_layer:
            self.qkvw = self.transpose_impl(self.qkvw.data)
            self.dense_w = self.transpose_impl(self.dense_w.data)

    def transpose_mlp(self):
        if self.mlp_linear_layer:
            self._h4h_w = self.transpose_impl(self._h4h_w.data)
            self._4hh_w = self.transpose_impl(self._4hh_w.data)

    def transpose_impl(self, data):
        # In-place transpose via a flat copy, then reinterpret with swapped dims.
        data = data.contiguous()
        data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1))
        data = data.reshape(data.shape[-1], data.shape[-2])
        # NOTE(review): the result of .to(...) is discarded, so this line is a
        # no-op -- likely intended `data = data.to(...)`; left unchanged here.
        data.to(get_accelerator().current_device_name())
        return data
+from .base import * +from deepspeed import comm as dist +import deepspeed.ops.transformer as transformer_inference +from deepspeed.accelerator import get_accelerator + + +class BaseTransformerMoEContainer(BaseTransformerContainer): + def __init__(self, **kwargs): + # Call the init function of the parent class to initialize the tensors and configs from parent class + super().__init__(**kwargs) + + self.num_experts = self.policy.get_num_experts() + self.ep_world_size = dist.get_world_size() + self.local_ep_size = 1 if self.num_experts < self.ep_world_size else self.num_experts // self.ep_world_size + + self.layer_norm_eps = self.config.layer_norm_eps if hasattr( + self.config, + 'layer_norm_eps') else 1e-12, + + # MoE models will have a list of mlp related tensors + self._h4h_w = [] + self._h4h_b = [] + self._4hh_w = [] + self._4hh_b = [] + + # Residual MoE needs extra parameters + self._res_h4h_w = None + self._res_h4h_b = None + self._res_4hh_w = None + self._res_4hh_b = None + self._res_coef = None + + def create_ds_model_config(self): + self.set_hidden_heads(*self.policy.get_hidden_heads()) + assert self.num_attention_heads % self.mp_size == 0,\ + "To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\ + "This is because the attention computation is partitioned evenly among the parallel GPUs." 
+ + self.ds_model_config = transformer_inference.DeepSpeedMoEInferenceConfig( + hidden_size=self.hidden_size, + heads=self.num_attention_heads, + layer_norm_eps=self.layer_norm_eps, + fp16=self.fp16, + pre_layer_norm=self.pre_layer_norm, + mp_size=self.mp_size, + q_int8=self.quantize, + moe_experts=self.local_ep_size, + global_experts=self.num_experts, + mlp_type=self.config.moe.type, + scale_attn_by_inverse_layer_idx=self.scale_attn_by_inverse_layer_idx, + ) + + return self.ds_model_config + + def initialize_tensors(self): + # Set the tensors from policy (user module) to container (DS module) + self.set_attention(*self.policy.attention()) + self.set_mlp(self.config.moe.type) + self.set_layernorm(*self.policy.layernorm()) + + def set_mlp(self, config_moe_type): + if config_moe_type == 'standard': + self._h4h_w, self._h4h_b, \ + self._4hh_w, self._4hh_b = self.policy.mlp() + else: + self._h4h_w, self._h4h_b, self._4hh_w, \ + self._4hh_b, self._res_h4h_w, self._res_h4h_b, \ + self._res_4hh_w, self._res_4hh_b, \ + self._res_coef = self.policy.mlp(config_moe_type) + + def transpose(self): + self.transpose_attention() + self.transpose_mlp() + + if self.config.moe.type == 'residual': + self.transpose_residual() + + def transpose_mlp(self): + self._h4h_w = [self.transpose_impl(moe_w1.data) for moe_w1 in self._h4h_w] + self._4hh_w = [self.transpose_impl(moe_w1.data) for moe_w1 in self._4hh_w] + + def transpose_residual(self): + self._res_h4h_w.data = self.transpose_impl(self._res_h4h_w.data) + self._res_4hh_w.data = self.transpose_impl(self._res_4hh_w.data) + self._res_coef.data = self.transpose_impl(self._res_coef.data) + + def apply_tensor_parallelism(self, mp_replace): + # setup the new Attention module + self.attention_qkv_mp(mp_replace) + self.attention_o_mp(mp_replace) + + # quantize attention weights + self.attention_quantization() + + # setup the new MLP module + self.mlp_mp() + + def mlp_mp(self): + gpu_index = dist.get_rank() + for ep_index in 
def copy_data_to_new_module(self):
    """Move layernorm (and residual-MoE mlp) tensors onto the accelerator device
    of the DS inference module."""
    # The device name is invariant here, so look it up once.
    device = get_accelerator().current_device_name()

    self.module.attn_nw.data = self.attn_nw.to(device)
    self.module.attn_nb.data = self.attn_nb.to(device)

    # Input layernorm params are copied in place rather than rebound.
    self.module.norm_w.data.copy_(self.input_nw.to(device))
    self.module.norm_b.data.copy_(self.input_nb.to(device))

    if self.config.moe.type == 'residual':
        # Residual MoE: also install the dense residual mlp and gate coefficient.
        res_mlp = self.module.res_mlp
        res_mlp.inter_w.data = self._res_h4h_w.to(device)
        res_mlp.inter_b.data = self._res_h4h_b.to(device)
        res_mlp.output_w.data = self._res_4hh_w.to(device)
        res_mlp.output_b.data = self._res_4hh_b.to(device)
        self.module.res_coef.data = self._res_coef.to(device)
def attention(self):
    """Fuse the separate q/k/v projections into single qkv tensors and return
    (qkvw, qkvb, attn_out_w, attn_out_b)."""
    attn = self.client_module.attention.self
    qkvw = Parameter(torch.cat((attn.query.weight,
                                attn.key.weight,
                                attn.value.weight),
                               dim=0),
                     requires_grad=False)
    qkvb = Parameter(torch.cat((attn.query.bias,
                                attn.key.bias,
                                attn.value.bias),
                               dim=0),
                     requires_grad=False)

    out_dense = self.client_module.attention.output.dense
    return qkvw, qkvb, out_dense.weight, out_dense.bias
def layernorm(self):
    """Return (attn_ln_weight, attn_ln_bias, ln_weight, ln_bias).

    Pre-attn-norm modules expose the norms directly on the layer; the
    post-LN HuggingFace layout keeps them inside attention.output / output.
    """
    cm = self.client_module
    if self.pre_attn_norm:
        attn_ln, ln = cm.PostAttentionLayerNorm, cm.PreAttentionLayerNorm
    else:
        attn_ln, ln = cm.attention.output.LayerNorm, cm.output.LayerNorm
    return attn_ln.weight, attn_ln.bias, ln.weight, ln.bias
def attention_qkv_mp(self, mp_replace):
    """Shard the fused qkv weight and bias across model-parallel ranks
    using the mp_replace helper."""
    attn = self.module.attention
    attn.attn_qkvw = mp_replace.copy(attn.attn_qkvw, self.qkvw)
    attn.attn_qkvb = mp_replace.copy(attn.attn_qkvb, self.qkvb)
def mlp(self):
    """Return the dense mlp parameters: (h->4h weight, h->4h bias,
    4h->h weight, 4h->h bias)."""
    dense_in = self.client_module.mlp.dense_h_to_4h
    dense_out = self.client_module.mlp.dense_4h_to_h
    return dense_in.weight, dense_in.bias, dense_out.weight, dense_out.bias
def layernorm(self):
    """Return (layer_norm2 w/b, layer_norm1 w/b) matching the policy contract
    of (attention-side layernorm, transformer-side layernorm)."""
    post = self.client_module.layer_norm2
    pre = self.client_module.layer_norm1
    return post.weight, post.bias, pre.weight, pre.bias
def get_hidden_heads(self):
    """Return (hidden_size, n_heads), with hidden size read off the q
    projection's weight shape."""
    attn = self.client_module.attention
    return attn.q_lin.weight.shape[1], attn.n_heads
def transpose_qkv_alignment(self, x):
    """Re-order a fused megatron-v2 qkv tensor from per-head-interleaved
    layout to contiguous q|k|v blocks — presumably the layout the DS
    inference kernels expect (TODO confirm against kernel docs).

    Works for both 2D weights and 1D biases; output shape equals input shape.
    """
    head_dim = x.shape[-1] // self.num_attention_heads
    # View the last dim as (heads, head_dim), then split each head's slice
    # into its q, k and v thirds.
    per_head = x.view(*x.size()[:-1], self.num_attention_heads, head_dim)
    q, k, v = torch.split(per_head, per_head.shape[-1] // 3, dim=per_head.dim() - 1)

    if q.dim() > 2:
        # Weight case: flatten per-row before concatenating the q/k/v blocks.
        flat = (q.reshape(q.shape[0], -1),
                k.reshape(q.shape[0], -1),
                v.reshape(q.shape[0], -1))
    else:
        # Bias case: flatten fully.
        flat = (q.reshape(-1), k.reshape(-1), v.reshape(-1))
    return torch.cat(flat, dim=-1).reshape(x.shape)
+ In addition to the parameter names, we require two + more parameters to help read the the data correctly + from the checkpoint and split the qkv heads in the + right order: + 1. `use_load_prefix` (Default: False): this specifies + whether we need to use the name of first abstraction + layer of the model for searching the parameter's name + in a checkpoint file. For more information of how this + is used please see + https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/load_checkpoint.py + 2. `split_qkv` (Default: True): we use this flag when splitting + the qkv parameter into heads. If it is False, it means the heads + of q, k, and v are stored together and needs to split in the + DeepSpeed-Inference API. + """ + raise NotImplementedError( + "A load_params() function must be defined in the model container \ + when inheriting the MetaTensorContainer feature") diff --git a/deepspeed/module_inject/containers/gpt2.py b/deepspeed/module_inject/containers/gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..dc194d71d45997b2c0b7f8f7966611f67e5ea4e6 --- /dev/null +++ b/deepspeed/module_inject/containers/gpt2.py @@ -0,0 +1,54 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .base import * +from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference +from ..policy import TransformerPolicy + + +class DS_GPT2Container(BaseTransformerContainer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # All model specific things should be defined here instead of the base class. 
def attention(self):
    """Return attention params. GPT2's c_attn (a Conv1D) already holds the
    fused qkv, so its weight/bias pass through unchanged."""
    attn = self.client_module.attn
    return (attn.c_attn.weight,
            attn.c_attn.bias,
            attn.c_proj.weight,
            attn.c_proj.bias)
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
    """Copy GPT-J transformer weights from the checkpoint state dict `sd`
    into the DS inference module.

    q/k/v are stored as three separate checkpoint tensors and fused into
    the module's attn_qkvw; GPT-J attention has no qkv/output biases, so the
    transformer_param_names indices are shifted relative to other models.
    """
    param_names = (
        'attn.q_proj.weight',
        'attn.k_proj.weight',
        'attn.v_proj.weight',
        'attn.out_proj.weight',
        'mlp.fc_in.weight',
        'mlp.fc_in.bias',
        'mlp.fc_out.weight',
        'mlp.fc_out.bias',
        'ln_1.weight',
        'ln_1.bias',
    )
    # Fuse the three separate projection weights into the qkv parameter.
    maybe_copy_qkv(module.attention,
                   sd,
                   weight_quantizer,
                   mp_replace,
                   'attn_qkvw',
                   [prefix + param_names[j] for j in range(3)],
                   split_qkv=self.policy.split_qkv)
    # Attention output projection weight -> transformer_param_names[2].
    maybe_copy(module.attention,
               sd,
               weight_quantizer,
               mp_replace,
               transformer_param_names[2],
               prefix + param_names[3])
    # MLP weights and biases.
    for j in range(4, 8):
        maybe_copy(module.mlp,
                   sd,
                   weight_quantizer,
                   mp_replace,
                   transformer_param_names[j],
                   prefix + param_names[j])
    # Input layernorm params live on the module itself (names shifted by 2).
    for j in range(8, 10):
        maybe_copy(module,
                   sd,
                   weight_quantizer,
                   mp_replace,
                   transformer_param_names[j + 2],
                   prefix + param_names[j])
def attention(self):
    """Fuse the q/k/v projection weights into one qkv tensor.
    GPT-J's projections have no biases, so both bias slots are None."""
    attn = self.client_module.attn
    qkvw = Parameter(torch.cat((attn.q_proj.weight,
                                attn.k_proj.weight,
                                attn.v_proj.weight),
                               dim=0),
                     requires_grad=False)
    return qkvw, None, attn.out_proj.weight, None
class HFGPTNEOLayerPolicy(TransformerPolicy):
    """Injection policy mapping a HuggingFace GPT-Neo block onto the DS
    inference module."""
    # Consistency fix: sibling policies (e.g. HFGPT2LayerPolicy,
    # BLOOMLayerPolicy) declare the cached original layer class explicitly
    # at class level; declare it here too so it exists before __init__ runs.
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        super().__init__(inference, scale_attention=False)
        self.client_module = client_module
        try:
            import transformers
            HFGPTNEOLayerPolicy._orig_layer_class = transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoBlock
        except Exception:
            # Narrowed from a bare `except:`; covers both ImportError and
            # AttributeError on older transformers versions.
            HFGPTNEOLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_heads) read from the q projection."""
        return self.client_module.attn.attention.q_proj.weight.shape[1], \
               self.client_module.attn.attention.num_heads

    def attention(self):
        """Fuse q/k/v projection weights (no biases) into a single qkv tensor."""
        qw = self.client_module.attn.attention.q_proj.weight
        kw = self.client_module.attn.attention.k_proj.weight
        vw = self.client_module.attn.attention.v_proj.weight

        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)

        return qkvw, \
               None, \
               self.client_module.attn.attention.out_proj.weight, \
               self.client_module.attn.attention.out_proj.bias

    def mlp(self):
        """Return mlp params: (c_fc w/b, c_proj w/b)."""
        return self.client_module.mlp.c_fc.weight, \
               self.client_module.mlp.c_fc.bias, \
               self.client_module.mlp.c_proj.weight, \
               self.client_module.mlp.c_proj.bias

    def layernorm(self):
        """Return (ln_2 w/b, ln_1 w/b) as (attention-side, transformer-side)."""
        return self.client_module.ln_2.weight, \
               self.client_module.ln_2.bias, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
def create_module(self, config=None):
    """Instantiate the DS GPT inference module for GPT-NeoX.

    Megatron-v2 checkpoints use the rotate-half rotary scheme instead of
    rotate-every-two, so flip the config flags accordingly.
    """
    _config = self.ds_model_config if config is None else config
    self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
    self.module.config.scale_attention = self.scale_attention

    if self.megatron_v2:
        self.module.config.rotate_half = True
        self.module.config.rotate_every_two = False

    return self.module
def get_hidden_heads(self):
    """Return (hidden_size, num_attention_heads) from the qkv projection.

    BUGFIX: the original computed the version-dependent `attention` local
    (version != 0 names the submodule `self_attention`) but then ignored it
    and hardcoded `self.client_module.attention`, which breaks non-zero
    versions. Now mirrors attention() below.
    """
    if GPTNEOXLayerPolicy.version == 0:
        attention = self.client_module.attention
    else:
        attention = self.client_module.self_attention

    return attention.query_key_value.weight.shape[1], \
           attention.num_attention_heads
BaseTransformerContainer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # All model specific things should be defined here instead of the base class. + + def create_module(self, config=None): + _config = config if config is not None else self.ds_model_config + self.module = DeepSpeedMegatronGPTInference(_config, mp_group=self.mp_group) + self.module.config.scale_attention = self.scale_attention + + if self.megatron_v2: + self.module.config.rotate_half = True + self.module.config.rotate_every_two = False + + return self.module + + +# TODO: Megatron GPT MoE inherits from Megatron policy and replaces mlp +# TODO: Generalize MoE overall goal, expand beyond Megatron +class MegatronLayerPolicy(TransformerPolicy): + _orig_layer_class = None + version = 0 + moe_type = 'standard' + megatron_v2 = True + use_mup = False + + def __init__(self, client_module, inference=True): + super().__init__(inference, + megatron_v2=MegatronLayerPolicy.megatron_v2, + use_mup=MegatronLayerPolicy.use_mup) + self.client_module = client_module + # we use megatron version to differentiate between the old and new + # megatron-lm source code + if MegatronLayerPolicy._orig_layer_class is None: + if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"): + MegatronLayerPolicy._orig_layer_class = None + else: + try: + from megatron.model.transformer import ParallelTransformerLayer + MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer + except ImportError: + MegatronLayerPolicy._orig_layer_class = None + + def get_hidden_heads(self): + return self.client_module.attention.query_key_value.weight.shape[1], \ + self.client_module.attention.num_attention_heads + + def attention(self): + if self.inference: + if MegatronLayerPolicy.version == 0: + attention = self.client_module.attention + else: + attention = self.client_module.self_attention + + return attention.query_key_value.weight, \ + attention.query_key_value.bias, \ + attention.dense.weight, \ + 
attention.dense.bias + + def mlp(self, moe_type='standard'): + from deepspeed.moe.utils import has_moe_layers + moe, _ = has_moe_layers(self.client_module) + + if moe: + moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \ + self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts + num_experts = len(moe_experts) + if moe_type == 'standard': + return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \ + [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \ + [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \ + [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)] + else: + + return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \ + [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \ + [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \ + [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \ + self.client_module.mlp.mlp.dense_h_to_4h.weight, \ + self.client_module.mlp.mlp.dense_h_to_4h.bias, \ + self.client_module.mlp.mlp.dense_4h_to_h.weight, \ + self.client_module.mlp.mlp.dense_4h_to_h.bias, \ + self.client_module.mlp.coefficient.weight + + else: + return self.client_module.mlp.dense_h_to_4h.weight, \ + self.client_module.mlp.dense_h_to_4h.bias, \ + self.client_module.mlp.dense_4h_to_h.weight, \ + self.client_module.mlp.dense_4h_to_h.bias + + def layernorm(self): + return self.client_module.post_attention_layernorm.weight, \ + self.client_module.post_attention_layernorm.bias, \ + self.client_module.input_layernorm.weight, \ + self.client_module.input_layernorm.bias diff --git a/deepspeed/module_inject/containers/megatron_gpt_moe.py b/deepspeed/module_inject/containers/megatron_gpt_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..2968161305c453a2611bc91e2a7b4dbbad27aa91 --- /dev/null +++ b/deepspeed/module_inject/containers/megatron_gpt_moe.py @@ -0,0 
+1,82 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .base import * +from .base_moe import * +from .features.megatron import MegatronContainer +from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference +import torch +from .megatron_gpt import MegatronLayerPolicy +from packaging import version as pkg_version + + +class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer): + def __init__(self, policy, config, model_config, layer_id): + super().__init__(policy, config, model_config, layer_id) + + # All model specific things should be defined here instead of the base class. + + def create_module(self, config=None): + _config = config if config is not None else self.ds_model_config + self.module = DeepSpeedMegatronGPTInference(_config, mp_group=self.mp_group) + self.module.config.scale_attention = self.scale_attention + + if self.megatron_v2: + self.module.config.rotate_half = True + self.module.config.rotate_every_two = False + + return self.module + + +# TODO: Megatron GPT MoE inherits from Megatron policy and replaces mlp +# TODO: Generalize MoE overall goal, expand beyond Megatron +class MegatronMoELayerPolicy(MegatronLayerPolicy): + _orig_layer_class = None + version = 0 + moe_type = 'standard' + num_experts = 1 + + def __init__(self, client_module, inference=True): + super().__init__(inference) + self.client_module = client_module + # we use megatron version to differentiate between the old and new + # megatron-lm source code + if MegatronMoELayerPolicy._orig_layer_class is None: + if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"): + MegatronMoELayerPolicy._orig_layer_class = None + else: + try: + from megatron.model.transformer import ParallelTransformerLayer + MegatronMoELayerPolicy._orig_layer_class = ParallelTransformerLayer + except ImportError: + MegatronMoELayerPolicy._orig_layer_class = None + + def get_num_experts(self): + return self.num_experts + + def 
mlp(self, moe_type='standard'): + # for now, all of this is tightly coupled to megatron-deepspeed moe implementation + # todo: think and refactor this to be more general + + #from deepspeed.moe.utils import has_moe_layers + #moe, _ = has_moe_layers(self.client_module) + + moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \ + self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts + num_experts = len(moe_experts) + self.num_experts = num_experts + + if moe_type == 'standard': + return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \ + [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \ + [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \ + [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)] + else: + return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \ + [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \ + [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \ + [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \ + self.client_module.mlp.mlp.dense_h_to_4h.weight, \ + self.client_module.mlp.mlp.dense_h_to_4h.bias, \ + self.client_module.mlp.mlp.dense_4h_to_h.weight, \ + self.client_module.mlp.mlp.dense_4h_to_h.bias, \ + self.client_module.mlp.coefficient.weight diff --git a/deepspeed/module_inject/containers/opt.py b/deepspeed/module_inject/containers/opt.py new file mode 100644 index 0000000000000000000000000000000000000000..8f9c30bcac8c60beca8dffb388f947a55774e2fe --- /dev/null +++ b/deepspeed/module_inject/containers/opt.py @@ -0,0 +1,134 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .base import * +from .features.meta_tensor import MetaTensorContainer +from deepspeed.model_implementations.transformers.ds_opt import DeepSpeedOPTInference +import torch +from torch.nn.parameter import Parameter +from ..policy import TransformerPolicy +from ..policy import 
transformer_param_names +from ..policy import maybe_copy +from ..policy import maybe_copy_qkv +from deepspeed.utils.types import ActivationFuncType + + +class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # All model specific things should be defined here instead of the base class. + + def create_module(self, config=None): + _config = config if config is not None else self.ds_model_config + self.module = DeepSpeedOPTInference(_config, mp_group=self.mp_group) + self.module.config.scale_attention = self.scale_attention + return self.module + + def load_params(self, module, sd, weight_quantizer, mp_replace, prefix): + param_names = ( + 'self_attn.q_proj.weight', \ + 'self_attn.k_proj.weight', \ + 'self_attn.v_proj.weight', \ + 'self_attn.q_proj.bias', \ + 'self_attn.k_proj.bias', \ + 'self_attn.v_proj.bias', \ + 'self_attn.out_proj.weight', \ + 'self_attn.out_proj.bias', \ + 'fc1.weight', \ + 'fc1.bias', \ + 'fc2.weight', \ + 'fc2.bias', \ + 'final_layer_norm.weight', \ + 'final_layer_norm.bias', \ + 'self_attn_layer_norm.weight', \ + 'self_attn_layer_norm.bias' + ) + + for i in range(0, 6, 3): + maybe_copy_qkv(module.attention, + sd, + weight_quantizer, + mp_replace, + transformer_param_names[i // 3], + [ + prefix + param_names[i], + prefix + param_names[i + 1], + prefix + param_names[i + 2] + ], + split_qkv=self.policy.split_qkv) + for i in range(6, 8): + maybe_copy(module.attention, + sd, + weight_quantizer, + mp_replace, + transformer_param_names[i - 4], + prefix + param_names[i]) + for i in range(8, 14): + maybe_copy(module.mlp, + sd, + weight_quantizer, + mp_replace, + transformer_param_names[i - 4], + prefix + param_names[i]) + for i in range(14, 16): + maybe_copy(module, + sd, + weight_quantizer, + mp_replace, + transformer_param_names[i - 4], + prefix + param_names[i]) + + +class HFOPTLayerPolicy(TransformerPolicy): + _orig_layer_class = None + + def __init__(self, 
client_module, inference=True, use_load_prefix=True): + super().__init__(inference, + linear_layer=True, + mlp_act_func_type=ActivationFuncType.ReLU, + pre_attn_norm=True, + use_load_prefix=use_load_prefix) + self.client_module = client_module + + try: + import transformers + HFOPTLayerPolicy._orig_layer_class = transformers.models.opt.modeling_opt.OPTDecoderLayer + if isinstance(TransformerPolicy.hf_model_config, + transformers.models.opt.configuration_opt.OPTConfig): + self.pre_attn_norm = TransformerPolicy.hf_model_config.do_layer_norm_before + except: + HFOPTLayerPolicy._orig_layer_class = None + + def get_hidden_heads(self): + return self.client_module.self_attn.embed_dim, \ + self.client_module.self_attn.num_heads + + def attention(self): + qw = self.client_module.self_attn.q_proj.weight + qb = self.client_module.self_attn.q_proj.bias + + kw = self.client_module.self_attn.k_proj.weight + kb = self.client_module.self_attn.k_proj.bias + + vw = self.client_module.self_attn.v_proj.weight + vb = self.client_module.self_attn.v_proj.bias + + qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False) + qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False) + + return qkvw, \ + qkvb, \ + self.client_module.self_attn.out_proj.weight, \ + self.client_module.self_attn.out_proj.bias + + def mlp(self): + return self.client_module.fc1.weight, \ + self.client_module.fc1.bias, \ + self.client_module.fc2.weight, \ + self.client_module.fc2.bias + + def layernorm(self): + return self.client_module.final_layer_norm.weight, \ + self.client_module.final_layer_norm.bias, \ + self.client_module.self_attn_layer_norm.weight, \ + self.client_module.self_attn_layer_norm.bias diff --git a/deepspeed/module_inject/containers/unet.py b/deepspeed/module_inject/containers/unet.py new file mode 100644 index 0000000000000000000000000000000000000000..461ca12bf36ac436cdb860b653476c6fa8d0c7a9 --- /dev/null +++ b/deepspeed/module_inject/containers/unet.py @@ -0,0 +1,51 @@ 
+''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +import torch +from torch.nn.parameter import Parameter + +from ..policy import DSPolicy +from ...model_implementations.diffusers.unet import DSUNet + + +class UNetPolicy(DSPolicy): + def __init__(self): + super().__init__() + try: + import diffusers + self._orig_layer_class = diffusers.models.unet_2d_condition.UNet2DConditionModel + except ImportError: + self._orig_layer_class = None + + def match(self, module): + return isinstance(module, self._orig_layer_class) + + def match_replaced(self, module): + return isinstance(module, DSUNet) + + def apply(self, module, enable_cuda_graph=True): + # TODO(cmikeh2): Enable cuda graph should be an inference configuration + return DSUNet(module, enable_cuda_graph=enable_cuda_graph) + + def attention(self, client_module): + qw = client_module.to_q.weight + kw = client_module.to_k.weight + vw = client_module.to_v.weight + + if qw.shape[1] == kw.shape[1]: + qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False) + + return qkvw, \ + client_module.to_out[0].weight, \ + client_module.to_out[0].bias, \ + qw.shape[-1], \ + client_module.heads + else: + #return None + #kvw = Parameter(torch.cat((kw, vw), dim=0), requires_grad=False) + return qw, \ + kw, vw, \ + client_module.to_out[0].weight, \ + client_module.to_out[0].bias, \ + qw.shape[-1], \ + client_module.heads diff --git a/deepspeed/module_inject/containers/vae.py b/deepspeed/module_inject/containers/vae.py new file mode 100644 index 0000000000000000000000000000000000000000..c873f9768a8795b6b5869692855e1d1dba7908a1 --- /dev/null +++ b/deepspeed/module_inject/containers/vae.py @@ -0,0 +1,33 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +from ..policy import DSPolicy +from ...model_implementations.diffusers.vae import DSVAE + + +class VAEPolicy(DSPolicy): + def __init__(self): + super().__init__() + try: + import diffusers + if hasattr(diffusers.models.vae, "AutoencoderKL"): + self._orig_layer_class 
= diffusers.models.vae.AutoencoderKL + else: + # Diffusers >= 0.12.0 changes location of AutoencoderKL + self._orig_layer_class = diffusers.models.autoencoder_kl.AutoencoderKL + except ImportError: + self._orig_layer_class = None + + def match(self, module): + return isinstance(module, self._orig_layer_class) + + def match_replaced(self, module): + return isinstance(module, DSVAE) + + def apply(self, module, enable_cuda_graph=True): + # TODO(cmikeh2): Enable cuda graph should be an inference configuration + return DSVAE(module, enable_cuda_graph=enable_cuda_graph) + + # NOTE (lekurile): Should we have a diffusers policy class? + def attention(self): + pass diff --git a/deepspeed/module_inject/inject.py b/deepspeed/module_inject/inject.py old mode 100644 new mode 100755 index a601ef10e1d2508172af7ce44c371aa23e5b70b1..384bb7279fdf02d346e2db14f4590e599b00e992 --- a/deepspeed/module_inject/inject.py +++ b/deepspeed/module_inject/inject.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import copy import torch from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0636e89289d549f3666e19e27dc20d58a5ab9873 --- /dev/null +++ b/deepspeed/module_inject/layers.py @@ -0,0 +1,101 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from deepspeed import comm as dist +from torch import nn +from torch.nn import functional as F + +from torch.nn.parameter import Parameter +from deepspeed.accelerator import get_accelerator + + +class LinearAllreduce(nn.Module): + def __init__(self, weight, bias=None, mp_group=None): + super(LinearAllreduce, self).__init__() + self.weight = weight + self.bias = bias + self.mp_group = mp_group + + def forward(self, input): + output = torch.matmul(input, self.weight.transpose(-1, -2)) + if self.mp_group is not None: + 
dist.all_reduce(output, group=self.mp_group) + if self.bias is not None: + output += self.bias + return output + + +class LinearLayer(nn.Module): + def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None): + super(LinearLayer, self).__init__() + if weight is not None: + self.weight = weight + self.bias = bias + else: + self.weight = Parameter( + torch.empty(weight_shape, + dtype=dtype, + device=get_accelerator().current_device_name())) + + self.bias = Parameter( + torch.empty(weight_shape[0], + dtype=dtype, + device=get_accelerator().current_device_name())) \ + if bias is not None else None + + def forward(self, input): + output = torch.matmul(input, self.weight.transpose(-1, -2)) + if self.bias is not None: + output += self.bias + return output + + +class Normalize(nn.Module): + def __init__(self, dim, dtype=torch.float, eps=1e-5): + super(Normalize, self).__init__() + self.norm = nn.LayerNorm(dim, + eps=eps).to(dtype).to( + get_accelerator().current_device_name()) + self.weight = self.norm.weight + self.bias = self.norm.bias + + def forward(self, input): + return self.norm(input) + + +class EmbeddingLayer(nn.Module): + def __init__(self, weight_shape, dtype=torch.half): + super(EmbeddingLayer, self).__init__() + self.weight = Parameter( + torch.empty(weight_shape[0], + weight_shape[1], + dtype=dtype, + device=get_accelerator().current_device_name())) + + def forward(self, input): + return F.embedding(input, self.weight) + + +class OPTEmbedding(EmbeddingLayer): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + def __init__(self, weight_shape): + # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. 
Other models don't have this hack + self.offset = 2 + super().__init__(weight_shape) + + def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + attention_mask = attention_mask.long() + + # create positions depending on attention_mask + positions = (torch.cumsum(attention_mask, + dim=1).type_as(attention_mask) * + attention_mask).long() - 1 + + # cut positions if `past_key_values_length` is > 0 + positions = positions[:, past_key_values_length:] + + return super().forward(positions + self.offset) diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..ff8f454d7247e0b10746fca759ea0729ee76fee0 --- /dev/null +++ b/deepspeed/module_inject/load_checkpoint.py @@ -0,0 +1,289 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from torch import nn +from deepspeed.model_implementations.transformers.ds_bloom import DeepSpeedBloomInference +from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference +from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference +from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference +from deepspeed.model_implementations.transformers.ds_opt import DeepSpeedOPTInference + +import deepspeed.ops.transformer as transformer_inference +from .layers import LinearLayer, Normalize, EmbeddingLayer, OPTEmbedding +import torch +import gc +from deepspeed.accelerator import get_accelerator + + +def load_model_with_checkpoint(r_module, + sd, + mp_replace, + ckpt_type, + ckpt_mp_size, + weight_quantizer=None, + rank=0, + container=None): + error_msgs = [] + + def transpose(data): + with torch.no_grad(): + data = data.contiguous() + data1 = data.transpose(-1, -2).reshape(-1) + data.reshape(-1).copy_(data1) + data1 = None + return data.reshape(data.shape[-1], 
data.shape[-2]) + + def load(module, prefix): + args = (sd[0], prefix, {}, True, [], [], error_msgs) + + if hasattr(module, 'weight'): + module.weight = mp_replace.copy(module.weight.data, sd[0][prefix + 'weight']) + if prefix + 'bias' in sd[0].keys(): + if module.bias.data.is_meta: + # meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here + module.bias = torch.nn.parameter.Parameter( + data=torch.empty_like(module.bias.data, + device="cpu"), + requires_grad=module.bias.data.requires_grad) + module.bias = mp_replace.copy(module.bias.data, sd[0][prefix + 'bias']) + args = None + gc.collect() + + def load_transformer_layer(module, prefix): + if ckpt_type == "tp": + + def load_parameters(module, prefix): + for n, p in module.named_parameters(): + if prefix + n in sd[0] and len(n.split('.')) == 1: + if type(sd[0][prefix + n]) is list: + tmp_data, scale = sd[0][prefix + n] + tmp_data = tmp_data + scale = scale.to(get_accelerator().current_device_name()) + # set the quantizer number of groups using the checkpoint scale shape + weight_quantizer.num_groups = scale.shape[0] + else: + tmp_data = sd[0][prefix + n].to( + get_accelerator().current_device_name()) + scale = None + src_shape = tmp_data.shape + dst_shape = p.shape + inner_dim = 1 if tmp_data.dtype == torch.int8 else 0 + outer_dim = 0 if tmp_data.dtype == torch.int8 else 1 + if (len(src_shape) == 2 and len(dst_shape) == 2): + if (src_shape[inner_dim] == dst_shape[0] + and src_shape[outer_dim] == dst_shape[1]): + if tmp_data.dtype != torch.int8: + p = weight_quantizer.quantize( + transpose(tmp_data) if weight_quantizer. 
+ q_int8 else tmp_data) + else: + p = torch.nn.parameter.Parameter(tmp_data, + requires_grad=False) + p.scale = scale + setattr(module, n, p) + else: + dim = inner_dim if src_shape[inner_dim] != dst_shape[ + 0] else outer_dim + dim1 = 0 if src_shape[inner_dim] != dst_shape[0] else 1 + if src_shape[dim] > dst_shape[dim1]: + weight_partition = torch.split( + tmp_data, + dst_shape[dim1], + dim=dim)[rank].to( + get_accelerator().current_device_name()) + assert tmp_data.dtype != torch.int8 or scale.numel() > weight_quantizer.num_groups * (rank+1), \ + '''ERROR: We require the quantization scales for larger TP-size when loading INT8 checkpoint!\ + Please use the FP16 checkpoint to generate INT8 checkpoint with the sharding parameters!''' + scale = scale.view( + -1)[weight_quantizer.num_groups * + (rank + 1):].reshape( + weight_quantizer.num_groups, + -1).contiguous() + else: + assert tmp_data.dtype != torch.int8, \ + '''Merging of the checkpoints are not supported when using INT8 checkpoint! \ + Please use a as many GPUs as TP-size for the checkpoint''' + all_data = [ + sd[j][prefix + + n] if type(sd[j][prefix + n]) is list else + sd[j][prefix + n].to( + get_accelerator().current_device_name()) + for j in range(len(sd)) + ] + # Check if the weight tensor is for the QKV parameter + if src_shape[1] == (3 * + src_shape[0]) // ckpt_mp_size: + qkv_size = src_shape[outer_dim] // 3 + src_split = [ + torch.split(src[0].data, + qkv_size, + dim=outer_dim) + for src in all_data + ] + + weight_partition = torch.cat([ + torch.cat([qkv_s[i] for qkv_s in src_split], + axis=outer_dim) + for i in range(len(src_split[0])) + ], + dim=dim) + else: + weight_partition = torch.cat([ + ad[0].to( + get_accelerator().current_device_name()) + if type(ad) is list else ad + for ad in all_data + ], + dim=dim) + if tmp_data.dtype == torch.int8: + scale = torch.cat([ + ad[1].to( + get_accelerator().current_device_name()) + for ad in all_data + ], + dim=dim) + + if tmp_data.dtype != torch.int8: + 
weight_partition = weight_quantizer.quantize( + transpose(weight_partition), \ + parallel_dim=(0 if dim == 1 else 1)) if weight_quantizer.q_int8 else \ + weight_quantizer.quantize(weight_partition) + else: + weight_partition = torch.nn.parameter.Parameter( + weight_partition, + requires_grad=False) + weight_partition.scale = scale + setattr(module, n, weight_partition) + else: + if src_shape[0] == dst_shape[0]: + p.data.copy_(tmp_data) + else: + if src_shape[0] > dst_shape[0]: + bias_split = torch.split( + tmp_data, + dst_shape[-1])[rank].to(get_accelerator( + ).current_device_name()).contiguous() + p.data.copy_(bias_split) + else: + # Check if the weight tensor is for the QKV parameter + if src_shape[0] == (3 * r_module.config.hidden_size + ) // ckpt_mp_size: + qkv_size = src_shape[0] // 3 + src_split = [ + torch.split(sd[j][prefix + n], + qkv_size, + dim=0) for j in range(len(sd)) + ] + + p.data.copy_( + torch.cat( + [ + torch.cat([ + qkv_s[i] for qkv_s in src_split + ], + axis=0) + for i in range(len(src_split[0])) + ], + dim=0).to(get_accelerator( + ).current_device_name()).contiguous()) + else: + p.data.copy_( + torch.cat( + [ + sd[j][prefix + n] + for j in range(len(sd)) + ], + dim=0).to(get_accelerator( + ).current_device_name()).contiguous()) + + load_parameters(module, prefix) + for n, child in module.named_children(): + load_parameters(child, prefix + n + '.') + else: + container.load_params(module, sd[0], weight_quantizer, mp_replace, prefix) + + try: + import transformers + OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding + except: + OPTLearnedPositionalEmbedding = None + layer_policies = { + nn.Linear: load, + nn.Embedding: load, + nn.LayerNorm: load, + EmbeddingLayer: load, + LinearLayer: load, + Normalize: load, + transformer_inference.DeepSpeedTransformerInference: load_transformer_layer, + DeepSpeedBloomInference: load_transformer_layer, + DeepSpeedGPTInference: load_transformer_layer, + 
DeepSpeedBERTInference: load_transformer_layer, + DeepSpeedMegatronGPTInference: load_transformer_layer, + DeepSpeedOPTInference: load_transformer_layer, + OPTLearnedPositionalEmbedding: load, + OPTEmbedding: load + } + + all_ds_ids = {} + + def load_module_recursive(module, prefix='', level=0): + for name, child in module.named_children(): + if child.__class__ in layer_policies: + checking_key = prefix + name + '.' + if not any(checking_key in item for item in sd[0].keys()): + if hasattr(child, 'weight') and \ + (hasattr(child.weight, 'ds_id') and \ + child.weight.ds_id in all_ds_ids): + prefix1 = all_ds_ids[child.weight.ds_id] + if child.__class__ is nn.Linear: + child = LinearLayer(weight=all_ds_ids[child.weight.ds_id]) + setattr(module, name, child) + continue + child_params = list(child.parameters()) + if len(child_params) > 0 and (child_params[0].numel() == 0 + or child_params[0].is_meta): + if child.weight.is_meta: + ds_shape = child.weight.shape + else: + ds_shape = child.weight.ds_shape + if child.__class__ is nn.LayerNorm: + child = Normalize(dim=ds_shape[-1], + dtype=child.weight.dtype, + eps=child.eps) + setattr(module, name, child) + elif child.__class__ is nn.Linear: + child = LinearLayer(weight_shape=child.weight.shape, + bias=child.bias) + setattr(module, name, child) + elif child.__class__ is OPTLearnedPositionalEmbedding: + child = OPTEmbedding(weight_shape=ds_shape) + setattr(module, name, child) + else: + ds_id = None + if hasattr(child.weight, 'ds_id'): + ds_id = child.weight.ds_id + child = EmbeddingLayer(weight_shape=ds_shape, + dtype=child.weight.dtype) + if ds_id is not None: + all_ds_ids[ds_id] = child.weight + setattr(module, name, child) + layer_policies[child.__class__](child, prefix + name + '.') + else: + load_module_recursive( + child, + prefix if (level == 0 and ckpt_type == 'pp') and container.policy.use_load_prefix else \ + prefix + name + '.', + level + 1) + + load_module_recursive(r_module) + + embedding_weight = None + + for n, 
p in r_module.named_parameters(): + if "word_embeddings." in n or "embed_tokens." in n or "wte." in n: + embedding_weight = p + if embedding_weight is not None and r_module.lm_head.weight.is_meta: + r_module.lm_head.weight = embedding_weight + for sd_ in sd: + del sd_ + sd = None + gc.collect() diff --git a/deepspeed/module_inject/module_quantize.py b/deepspeed/module_inject/module_quantize.py old mode 100644 new mode 100755 index fde6990eba280691a7c1ef37093d9973f0c8a074..4123a1214f7b29458ce2f0920f936653c46f04b5 --- a/deepspeed/module_inject/module_quantize.py +++ b/deepspeed/module_inject/module_quantize.py @@ -1,6 +1,6 @@ -import copy +'''Copyright The Microsoft DeepSpeed Team''' + import torch -import deepspeed def quantize_transformer_layer(orig_layer_impl, model, megatron=False, preln=False): diff --git a/deepspeed/module_inject/policy.py b/deepspeed/module_inject/policy.py new file mode 100644 index 0000000000000000000000000000000000000000..dfd3343e12a34de23fe6660b68dac0257586d1d5 --- /dev/null +++ b/deepspeed/module_inject/policy.py @@ -0,0 +1,205 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +from abc import ABC, abstractmethod +from deepspeed.utils.types import ActivationFuncType +import torch +from deepspeed.accelerator import get_accelerator + +transformer_param_names = ( + 'attn_qkvw', \ + 'attn_qkvb', \ + 'attn_ow' , \ + 'attn_ob', \ + 'inter_w', \ + 'inter_b', \ + 'output_w', \ + 'output_b', \ + 'attn_nw', \ + 'attn_nb', \ + 'norm_w', \ + 'norm_b') + + +class DSPolicy(ABC): + _orig_layer_class = None + + def __init__(self): + self.cuda_graph_supported = False + + @abstractmethod + def attention(self): + """ + Returns attention qkv and dense parameters + weight: (3*hidden, hidden) and (hidden, hidden) + bias: (3*hidden) and (hidden) + """ + raise NotImplementedError + + +class TransformerPolicy(DSPolicy): + # a static class variable containing the HuggingFace model configuration. 
+ # see e.g., transformers.models.opt.configuration_opt.OPTConfig + hf_model_config = None + + def __init__( + self, + inference=True, + linear_layer=True, + scale_attention=True, + megatron_v2=False, + use_mup=False, + # the type of activation function used in MLP + mlp_act_func_type=ActivationFuncType.GELU, + # applies layer norm before attention if `pre_attn_norm` is set to True + pre_attn_norm=True, + # this flag shows whether or not using prefix in loading the checkpoint + use_load_prefix=False, + # whether or not the qkv is stored in the split-format + split_qkv=True): + super().__init__() + self.cuda_graph_supported = False + self.inference = inference + self.linear_layer = linear_layer + self.scale_attention = scale_attention + self.is_megatron_v2 = megatron_v2 + self.use_mup = use_mup + self.mlp_act_func_type = mlp_act_func_type + self.pre_attn_norm = pre_attn_norm + self.use_load_prefix = use_load_prefix + self.split_qkv = split_qkv + + @abstractmethod + def attention(self): + """ + Returns attention qkv and dense parameters + weight: (3*hidden, hidden) and (hidden, hidden) + bias: (3*hidden) and (hidden) + """ + raise NotImplementedError + + @abstractmethod + def get_hidden_heads(self): + """ + return hidden_size and number of heads + """ + raise NotImplementedError + + @abstractmethod + def mlp(self): + """ + Returns mlp intermediate and output + weight: (intermediate, hidden) and (hidden, intermediate) + bias: (intermediate) and (hidden) + """ + raise NotImplementedError + + @abstractmethod + def layernorm(self): + """ + Returns LayerNorms used in transformer layer + Post-Attention and pre/post layer norm + gamma and beta with shape: (hidden) + """ + raise NotImplementedError + + +# TODO (lekurile): This function exists in base container as well, consolidate as some point +def transpose(data): + with torch.no_grad(): + data = data.contiguous() + data1 = data.transpose(-1, -2).reshape(-1) + data.reshape(-1).copy_(data1) + data1 = None + return 
data.reshape(data.shape[-1], data.shape[-2]) + + +# TODO (lekurile): This function exists in megatron feature container as well, consolidate as some point +def _transpose(x, heads=1, mp_replace=None): + heads = heads // mp_replace.mp_size + outer_dim = -1 + attention_head_size = x.shape[outer_dim] // heads + new_x_shape = x.size()[:outer_dim] + (heads, attention_head_size) + x_1 = x.view(*new_x_shape) + (q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=-1) + if len(q.shape) > 2: + new_shape = (q.shape[0], ) + (-1, ) + return torch.cat((q.reshape(new_shape), + k.reshape(new_shape), + v.reshape(new_shape)), + dim=outer_dim).reshape(x.shape) + else: + return torch.cat((q.reshape(-1), + k.reshape(-1), + v.reshape(-1)), + dim=-1).reshape(x.shape) + + +# This checks if the parameter exits in the checkpoint file and maybe copies it into the corresponding destination tensor. +# Note that not all parameters are saved in one checkpoint, that's why we always need to check if they exist! +def maybe_copy(module, + sd, + weight_quantizer, + mp_replace, + dst_name, + src_name, + qkv=False, + megatron_v2=False, + split_qkv=False, + heads=1): + if src_name in sd: + dst = getattr(module, dst_name) + tmp = sd[src_name] + if len(dst.shape) == 1: + if split_qkv: + dst = mp_replace.qkv_copy(dst, tmp) + else: + dst = mp_replace.copy(dst, tmp) + if qkv and megatron_v2: + dst = torch.nn.parameter.Parameter( + _transpose(dst, + heads=heads, + mp_replace=mp_replace).contiguous()) + else: + if split_qkv: + dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \ + (transpose(tmp).contiguous())), int8=weight_quantizer.q_int8) + else: + if qkv and megatron_v2: + tmp = _transpose(transpose(tmp), + heads=heads, + mp_replace=mp_replace).contiguous() + if weight_quantizer.q_int8: + tmp = transpose(tmp) + dst = mp_replace.copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \ + transpose(tmp)), int8=weight_quantizer.q_int8) + 
setattr(module, dst_name, dst) + + +# Extending the maybe_copy function for when the q, k, and v are in separate parameters! +def maybe_copy_qkv(module, + sd, + weight_quantizer, + mp_replace, + dst_name, + src_names, + split_qkv=False): + if src_names[0] in sd: + q = sd[src_names[0]] + k = sd[src_names[1]] + v = sd[src_names[2]] + qkv_data = torch.cat((q, k, v), dim=0) + dst = getattr(module, dst_name) + if len(dst.shape) == 1: + if split_qkv: + dst = mp_replace.qkv_copy(dst, qkv_data.contiguous()) + else: + dst = mp_replace.copy(dst, qkv_data) + else: + if split_qkv: + dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(qkv_data.to(get_accelerator().device_name()) if weight_quantizer.q_int8 else \ + ((transpose(qkv_data)).contiguous())), int8=weight_quantizer.q_int8) + else: + dst = mp_replace.copy(dst, weight_quantizer.quantize(qkv_data.to(get_accelerator().device_name()) if weight_quantizer.q_int8 else \ + transpose(qkv_data)), int8=weight_quantizer.q_int8) + setattr(module, dst_name, dst) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 62d5cd75e9a1ca8413d06a78583eb529d358b006..c9032f9bb76453f23ea7c2677ef298b4f47bb6db 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -1,49 +1,36 @@ -import copy +'''Copyright The Microsoft DeepSpeed Team''' + +import os import torch +import tqdm import deepspeed import deepspeed.ops.transformer as transformer_inference -from .replace_policy import HFBertLayerPolicy, HFGPT2LayerPolicy, HFGPTJLayerPolicy -from .replace_policy import replace_policies -from ..constants import INFERENCE_GENERIC_MODE, INFERENCE_SPECIALIZED_MODE -from ..runtime.weight_quantizer import WeightQuantization +from deepspeed.ops.transformer.inference.diffusers_attention import DeepSpeedDiffusersAttention +from deepspeed.ops.transformer.inference.diffusers_transformer_block import DeepSpeedDiffusersTransformerBlock +from 
deepspeed.ops.transformer.inference.diffusers_2d_transformer import Diffusers2DTransformerConfig +from deepspeed.accelerator import get_accelerator +from .replace_policy import HFGPT2LayerPolicy +from .replace_policy import replace_policies, generic_policies + +from deepspeed import comm as dist from torch import nn +from .layers import LinearAllreduce, LinearLayer +from .load_checkpoint import load_model_with_checkpoint +import time -class LinearAllreduce(nn.Module): - def __init__(self, weight, bias=None, mp_group=None): - super(LinearAllreduce, self).__init__() - self.weight = weight - self.bias = bias - self.mp_group = mp_group - - def forward(self, input): - output = torch.matmul(input, self.weight) - if self.mp_group is not None: - torch.distributed.all_reduce(output, group=self.mp_group) - if self.bias is not None: - output += self.bias - return output - - -class LinearLayer(nn.Module): - def __init__(self, weight, bias=None): - super(LinearLayer, self).__init__() - self.weight = weight - self.bias = bias - - def forward(self, input): - output = torch.matmul(input, self.weight) - if self.bias is not None: - output += self.bias - return output +from .utils import policy_to_ds_container class ReplaceWithTensorSlicing: - def __init__(self, mp_group=None): + def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0): if mp_group is not None: - self.gpu_index = torch.distributed.get_rank(group=mp_group) + self.gpu_index = dist.get_rank(group=mp_group) else: self.gpu_index = 0 + self.out_dim = out_dim + self.in_dim = in_dim + self.mp_size = mp_size def merge_assert(self, dim1, dim2): assert dim1 > dim2, \ @@ -51,495 +38,415 @@ class ReplaceWithTensorSlicing: for merging your checkpoints before replacing the transformer layer with\ inference-kernels' - def qkv_copy(self, dst, src): + def qkv_copy(self, dst, src, int8=False): if src is None: - return torch.nn.Parameter(src) + return src src_shape = src.shape dst_shape = dst.shape - src_split = 
torch.split(src.data, src.shape[-1] // 3, dim=-1) + outer_dim = 0 if int8 else -1 + inner_dim = -1 if int8 else 0 + src_split = torch.split(src.data, src.shape[outer_dim] // 3, dim=outer_dim) if (len(src_shape) == 2 and len(dst_shape) == 2): - if src_shape[1] == dst_shape[1]: - return torch.nn.Parameter(src) - - self.merge_assert(src_shape[1], dst_shape[1]) - qkv_size = dst_shape[1] // 3 - qkv_split = [torch.split(src_s, qkv_size, dim=1) for src_s in src_split] - - weight_split = [ - torch.cat([qkv_s[i] for qkv_s in qkv_split], - axis=1) for i in range(len(qkv_split[0])) - ] - dst.data.copy_(weight_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) + if src_shape[outer_dim] == dst_shape[self.out_dim]: + dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape) + dst = torch.nn.parameter.Parameter(dst, requires_grad=False) + if hasattr(src, 'scale'): + dst.scale = src.scale + return dst + if self.out_dim == 1: + self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim]) + qkv_size = dst_shape[self.out_dim] // 3 + qkv_split = [ + torch.split(src_s, + qkv_size, + dim=outer_dim) for src_s in src_split + ] + + weight_split = [ + torch.cat([qkv_s[i] for qkv_s in qkv_split], + axis=outer_dim) for i in range(len(qkv_split[0])) + ] + dst = dst.reshape(-1).data.copy_( + weight_split[self.gpu_index].contiguous().reshape(-1)).reshape( + weight_split[self.gpu_index].shape) + else: + dst.data.copy_(src_split[self.gpu_index].to( + get_accelerator().current_device_name()).contiguous()) else: if src_shape[0] == dst_shape[0]: - return torch.nn.Parameter(src) - - qkv_size = dst_shape[0] // 3 - qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split] - bias_split = [ - torch.cat([qkv_s[i] for qkv_s in qkv_split], - axis=0) for i in range(len(qkv_split[0])) - ] - dst.data.copy_(bias_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) + return torch.nn.parameter.Parameter(src) + if self.out_dim == 1: + qkv_size 
= dst_shape[0] // 3 + qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split] + bias_split = [ + torch.cat([qkv_s[i] for qkv_s in qkv_split], + axis=0) for i in range(len(qkv_split[0])) + ] + dst.data.copy_(bias_split[self.gpu_index].contiguous()) + else: + dst.data.copy_(src_split[self.gpu_index].contiguous()) - return torch.nn.Parameter(dst) + dst = torch.nn.parameter.Parameter(dst, requires_grad=False) + if hasattr(src, 'scale'): + dst.scale = src.scale + return dst - def copy(self, dst, src): + def copy(self, dst, src, int8=False): if src is None: - return torch.nn.Parameter(src) - + return src + assert not dst.data.is_meta # the torch.Tensor.copy_ method used below will silently fail on meta tensors + outer_dim = 0 if int8 else 1 + inner_dim = 1 if int8 else 0 src_shape = src.shape dst_shape = dst.shape - if (len(src_shape) == 2 and len(dst_shape) == 2): - if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1]: - return torch.nn.Parameter(src) - - if src_shape[0] != dst_shape[0]: - self.merge_assert(src_shape[0], dst_shape[0]) - weight_split = torch.split(src, dst_shape[0]) + if src_shape[inner_dim] == dst_shape[ + self.in_dim] and src_shape[outer_dim] == dst_shape[self.out_dim]: + dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape) else: - self.merge_assert(src_shape[1], dst_shape[1]) - weight_split = torch.split(src.data, dst_shape[1], dim=1) - - dst.data.copy_(weight_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) + if src_shape[inner_dim] != dst_shape[self.in_dim]: + self.merge_assert(src_shape[inner_dim], dst_shape[self.in_dim]) + weight_split = torch.split( + src, + dst_shape[self.in_dim], + dim=inner_dim)[self.gpu_index].contiguous() + else: + self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim]) + weight_split = torch.split( + src.data, + dst_shape[self.out_dim], + dim=outer_dim)[self.gpu_index].contiguous() + dst = 
dst.reshape(-1).data.copy_(weight_split.reshape(-1)).reshape( + weight_split.shape) else: if src_shape[0] == dst_shape[0]: - return torch.nn.Parameter(src) - - bias_split = torch.split(src.data, dst_shape[-1]) - dst.data.copy_(bias_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) - - return torch.nn.Parameter(dst) + dst.data.copy_(src) + else: + bias_split = torch.split(src.data, + dst_shape[-1])[self.gpu_index].contiguous() + dst.data.copy_(bias_split) + dst = torch.nn.parameter.Parameter(dst, requires_grad=False) + if hasattr(src, 'scale'): + dst.scale = src.scale + return dst + + +def get_transformer_name(replaced_module): + from .containers import supported_models + from torch.nn import ModuleList + transformer_name = '' + for n, c in replaced_module.named_children(): + if c.__class__ in supported_models: + transformer_name += n + '.' + for name, child in c.named_children(): + if child.__class__ is ModuleList: + transformer_name += name + break + break + return transformer_name + + +class GroupQuantizer: + def __init__(self, q_int8=True, group_size=1, num_bits=8, num_groups=0): + self.group_size = group_size + self.num_bits = num_bits + self.q_int8 = q_int8 + + self.num_groups = num_groups + + def quantize(self, inputs, qkv=True, count=1, parallel_dim=0): + if not self.q_int8 or not qkv: + inputs = torch.nn.Parameter(inputs, requires_grad=False) + inputs.scale = torch.empty(1) + return inputs + q_range = 2**self.num_bits + num_groups = self.num_groups if self.num_groups > 0 else inputs.shape[ + 0] // self.group_size + inputs = inputs.to(get_accelerator().current_device_name()) + input_flat = inputs.reshape(num_groups, -1).contiguous() + input_min = torch.min(input_flat, dim=1, keepdim=True)[0].float() + input_max = torch.max(input_flat, dim=1, keepdim=True)[0].float() + scale = torch.max(input_min.abs(), input_max.abs()) * 2.0 / (q_range) + input_flat = (input_flat / scale).round().clamp(-q_range // 2, q_range // 2 - 1) + inputs_q = 
input_flat.reshape(inputs.shape).to(torch.int8).contiguous() + out = torch.nn.Parameter(inputs_q, requires_grad=False) + inputs_split = inputs.split(inputs.shape[parallel_dim] // 2, dim=parallel_dim) + input_flat = [ + inputs_split[i].reshape(num_groups, + -1).contiguous() for i in range(2) + ] + input_min = [ + torch.min(input_flat[i], + dim=1, + keepdim=True)[0].float() for i in range(2) + ] + input_max = [ + torch.max(input_flat[i], + dim=1, + keepdim=True)[0].float() for i in range(2) + ] + scale1 = [ + (torch.max(input_min[i].abs(), + input_max[i].abs()) * 2.0 / (q_range)).squeeze().unsqueeze(0) + for i in range(2) + ] + + out.scale = torch.cat([scale.squeeze().unsqueeze(0), + scale1[0], + scale1[1]], + dim=0).reshape(num_groups, + -1).contiguous() + return out + + +def _module_match(module): + for policy in generic_policies: + policy = policy() + if policy.match(module): + return policy + return None + + +def generic_injection(module, fp16=False, enable_cuda_graph=True): + def replace_attn(child, policy): + policy_attn = policy.attention(child) + if policy_attn is None: + return child + if len(policy_attn) == 5: + qkvw, attn_ow, attn_ob, hidden_size, heads = policy_attn + else: + qw, kw, vw, attn_ow, attn_ob, hidden_size, heads = policy_attn + + config = transformer_inference.DeepSpeedInferenceConfig( + hidden_size=hidden_size, + heads=heads, + fp16=fp16, + triangular_masking=False, + max_out_tokens=4096, + ) + attn_module = DeepSpeedDiffusersAttention(config) + + def transpose(data): + data = data.contiguous() + data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1)) + data = data.reshape(data.shape[-1], data.shape[-2]) + data.to(get_accelerator().current_device_name()) + return data + + if len(policy_attn) == 5: + attn_module.attn_qkvw.data = transpose(qkvw.data) + else: + attn_module.attn_qkvw = None + attn_module.attn_qw.data = transpose(qw.data) + attn_module.attn_kw.data = transpose(kw.data) + attn_module.attn_vw.data = 
transpose(vw.data) + + attn_module.attn_qkvb = None + attn_module.attn_ow.data = transpose(attn_ow.data) + attn_module.attn_ob.data.copy_( + attn_ob.data.to(get_accelerator().current_device_name())) + return attn_module + + def replace_attn_block(child, policy): + config = Diffusers2DTransformerConfig() + return DeepSpeedDiffusersTransformerBlock(child, config) + + if isinstance(module, torch.nn.Module): + pass + else: + if fp16 is False: + raise ValueError("Generic injection only supported with FP16") + + try: + import diffusers + cross_attention = diffusers.models.attention.CrossAttention + attention_block = diffusers.models.attention.BasicTransformerBlock + new_policies = { + cross_attention: replace_attn, + attention_block: replace_attn_block, + } + except ImportError: + new_policies = {} + + #replace_transformer_layer(None, + # module.text_encoder, + # training=False, + # replace_with_kernel_inject=True, + # triangular_masking=True, + # max_out_tokens=8192) + from ..model_implementations.transformers.clip_encoder import DSClipEncoder + cg_encoder = DSClipEncoder(module.text_encoder, + enable_cuda_graph=enable_cuda_graph) + setattr(module, 'text_encoder', cg_encoder) + for name in module.__dict__.keys(): + sub_module = getattr(module, name) + policy = _module_match(sub_module) + + if policy is not None: + + def _replace_module(module, policy): + for name, child in module.named_children(): + _replace_module(child, policy) + if child.__class__ in new_policies: + replaced_module = new_policies[child.__class__](child, + policy) + setattr(module, name, replaced_module) + + _replace_module(sub_module, policy) + new_module = policy.apply(sub_module, + enable_cuda_graph=enable_cuda_graph) + print(f"**** found and replaced {name} w. 
{type(new_module)}") + setattr(module, name, new_module) + + +container_g = None def replace_transformer_layer(orig_layer_impl, model, - policy=None, - micro_batch_size=-1, - config=None, - seed=-1, - hidden_size=-1, - num_attention_heads=-1, - mp_size=1, - training_mp_size=1, - mp_group=None, - ep_group=None, - expert_mp_group=None, - preln=True, - fp16=True, - local_rank=-1, - stochastic_mode=True, - training=True, - quantize=False, - quantize_settings=None, - triangular_masking=False, - return_tuple=True, - replace_with_kernel_inject=False, - linear_layer_setting=None, - moe=False, - moe_experts=1, - moe_type='standard'): + checkpoint_dict, + config, + model_config): """ Replace bert-style transformer layers with DeepSpeed's transformer layer Arguments: orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for, e.g., transformers.modeling_bert.BertLayer. model (torch.nn.Module): user's nn.module representing their model - policy: shows the policy for mapping from the orig_layer_impl to transformer parameters when - replace_with_kernel_inject is set, otherwise, it provides the names of two linear layers as - a tuple: (attention_output projection, transformer output projection) - micro_batch_size (int): micro batch size per gpu used during training/eval - config (dict): model config containing hidden size, attention heads, etc. - seed (int): random seed value - max_seq_length (int): max sequence length for training - hidden_size (int): hidden dimension - num_attention_heads (int): number of attention heads - mp_size (int): model_parallelism degree - mp_group : model_parallel group initialized on the modeling side - preln (bool): does the original layer implementation do pre or post layer norm? 
- fp16 (bool): fp16 or fp32 - local_rank (int): GPU rank (optional), - stochastic_mode (bool): whether to use stochastic mode - training (bool): specifying whether kernel-injection is done for training/inference (set to false for inference-mode injection) - quantize_settings (tuple): this setting shows how we can quantize a model for running it through the inference kernels. - It includes (quantization_scales, merge_count, mlp_extra_grouping, quantize_groups). - return_tuple (bool): if set, transformer layer returns a tuple as the output. - Note: this flag needs to be set for huggingface models. - replace_with_kernel_inject (bool): injection_mode, if true, kernels will be add along with configuring - Tensor-Parallelism - linear_layer_setting (tuple of modules) [Optional]: shows which two classes are used for linear layers - and embedding layers - attention_params: (list of strings) [Optional]: shows the parameters in the attention part that needs to - be adjusted based on the model-parallelism + checkpoint_dict: Dictionary for checkpoint passed from the Inference Engine + config: top-level DS Inference config defined in inference/config.py + model_config: HuggingFace model config passed from the inference/engine.py Returns: Updated nn.module with replaced transformer layers """ + # defining globals as internally defined functions inherit these everywhere + fp16 = (config.dtype == torch.float16 or config.dtype == torch.int8) + quantize = (config.dtype == torch.int8) + # todo: Refactor later. 
In future, let's minimize the style used above and use config.** instead + + linear_layer_setting = None + ''' + linear_layer_setting (tuple of modules) [Optional]: shows which two classes are used for linear layers and embedding layers + ''' + micro_batch_size = -1 + seed = -1 + local_rank = -1 + + mp_replace = ReplaceWithTensorSlicing( + mp_group=config.tensor_parallel.tp_group, + mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) + def replace_with_policy(child, policy_cls, triangular_masking, inference=False, - preln=True, layer_id=0): - preln = False if policy_cls is HFBertLayerPolicy else preln - if policy_cls is HFBertLayerPolicy: - policy = policy_cls(child, inference=inference, preln=preln) - else: - policy = policy_cls(child, inference=inference) + policy = policy_cls(child, inference=inference) + if not policy.cuda_graph_supported: + # policy says cuda graph is not supported raise an error if set + assert not config.enable_cuda_graph, "cuda graph is not supported with this model, please disable" - if inference: - hidden_size, num_attention_heads = policy.get_hidden_heads() - assert num_attention_heads % mp_size == 0,\ - "To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\ - "This is because the attention computation is partitioned evenly among the parallel GPUs." 
from deepspeed.moe.layer import MoE moe = False if hasattr(child, 'mlp') and isinstance(child.mlp, MoE): num_experts = child.mlp.num_experts moe = True - attn_linear_layer, qkvw, qkvb, dense_w, dense_b, scale_attention, megatron_v2 = policy.attention() - if not moe or moe_type == 'standard': - mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b = policy.mlp() - else: - mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b, \ - _res_h4h_w, _res_h4h_b, _res_4hh_w, _res_4hh_b, _res_coef = policy.mlp(moe_type) - - attn_nw, attn_nb, input_nw, input_nb = policy.layerNorm() - if quantize: - if policy_cls is not HFBertLayerPolicy: - qkvw = qkvw.to(torch.int8) - dense_w = dense_w.to(torch.int8) - _h4h_w = [moe_w1.to(torch.int8) - for moe_w1 in _h4h_w] if moe else _h4h_w.to(torch.int8) - _4hh_w = [moe_w1.to(torch.int8) - for moe_w1 in _4hh_w] if moe else _4hh_w.to(torch.int8) - elif fp16: - qkvw = qkvw.half() - dense_w = dense_w.half() - _h4h_w = [moe_w1.half() for moe_w1 in _h4h_w] if moe else _h4h_w.half() - _4hh_w = [moe_w1.half() for moe_w1 in _4hh_w] if moe else _4hh_w.half() - if quantize or fp16: - qkvb = qkvb if qkvb is None else qkvb.half() - dense_b = dense_b if dense_b is None else dense_b.half() - _h4h_b = [moe_b1.half() for moe_b1 in _h4h_b] if moe else _h4h_b.half() - _4hh_b = [moe_b1.half() for moe_b1 in _4hh_b] if moe else _4hh_b.half() - attn_nw = attn_nw if attn_nw is None else attn_nw.half() - attn_nb = attn_nb if attn_nb is None else attn_nb.half() - input_nw = input_nw.half() - input_nb = input_nb.half() - - if moe and moe_type == 'residual' and fp16: - _res_h4h_b = _res_h4h_b.half() - _res_4hh_b = _res_4hh_b.half() - _res_h4h_w = _res_h4h_w.half() - _res_4hh_w = _res_4hh_w.half() - _res_coef = _res_coef.half() - - mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) - #expert_mp_replace = ReplaceWithTensorSlicing(mp_group=expert_mp_group) - - if inference: - if moe: - ep_world_size = torch.distributed.get_world_size() - local_ep_size = 1 if num_experts < 
ep_world_size else num_experts // ep_world_size - - transformer_config = transformer_inference.DeepSpeedMoEInferenceConfig( - hidden_size=hidden_size, - heads=num_attention_heads, - layer_norm_eps=config.layer_norm_eps if hasattr( - config, - 'layer_norm_eps') else 1e-12, - fp16=fp16, - pre_layer_norm=preln, - mp_size=mp_size, - q_int8=quantize, - moe_experts=local_ep_size, - global_experts=num_experts, - mlp_type=moe_type) - else: - rotary_dim = config.rotary_dim if hasattr(config, 'rotary_dim') else child.attention.rotary_ndims \ - if hasattr(child, 'attention') and hasattr(child.attention,'rotary_ndims') else -1 - transformer_config = transformer_inference.DeepSpeedInferenceConfig( - hidden_size=hidden_size, - heads=num_attention_heads, - layer_norm_eps=config.layer_norm_eps if hasattr( - config, - 'layer_norm_eps') else - (config.layer_norm_epsilon - if hasattr(config, - 'layer_norm_epsilon') else config.layernorm_epsilon - if hasattr(config, - 'layernorm_epsilon') else 1.0e-12), - fp16=fp16, - pre_layer_norm=preln, - mp_size=mp_size, - q_int8=quantize, - return_tuple=(return_tuple or (policy_cls is HFBertLayerPolicy)), - triangular_masking=(policy_cls is not HFBertLayerPolicy), - local_attention=((config.attention_layers[layer_id] == "local") - if hasattr(config, - 'attention_layers') else False), - window_size=(config.window_size if hasattr(config, - 'window_size') else 1), - rotary_dim=rotary_dim, - mlp_after_attn=(rotary_dim is None or rotary_dim < 0), - training_mp_size=training_mp_size) - - if quantize and quantize_settings is not None: - (quantization_scales, - merge_count, - mlp_extra_grouping, - quantize_groups) = quantize_settings - if moe: - new_module = transformer_inference.DeepSpeedMoEInference( - transformer_config, - mp_group=mp_group, - ep_group=None if ep_group is None else ep_group[num_experts], - expert_mp_group=None - if expert_mp_group is None else expert_mp_group[num_experts], - quantize_scales=quantization_scales[layer_id], - 
quantize_groups=quantize_groups, - merge_count=merge_count, - mlp_extra_grouping=mlp_extra_grouping, - qkv_merging=(policy_cls is HFBertLayerPolicy)) + # 1. Create a model-specific container object using the policy object. + _container = policy_to_ds_container(policy=policy, + config=config, + model_config=model_config, + layer_id=layer_id, + child=child) + _container.set_dtype(fp16) + _container.set_moe(moe) - else: - new_module = transformer_inference.DeepSpeedTransformerInference( - transformer_config, - mp_group=mp_group, - quantize_scales=quantization_scales[layer_id], - quantize_groups=quantize_groups, - merge_count=merge_count, - mlp_extra_grouping=mlp_extra_grouping, - qkv_merging=(policy_cls is HFBertLayerPolicy)) - - if quantize and qkvw.dtype != torch.int8: - quantize_bits = 8 - quantizer = WeightQuantization() - if policy_cls is HFBertLayerPolicy: - data_quantized, _ = quantizer.quantize_data(qkvw.data, quantize_bits, quantize_groups * 3) - else: - data_quantized, _ = quantizer.quantize_data(qkvw.data, quantize_bits, quantize_groups) - qkvw.data.copy_(data_quantized) - qkvw.data = qkvw.data.to(torch.int8) - else: + # 2. Set the tensor parallelism config + _container.set_tensor_parallel_config(config.tensor_parallel.tp_size, + config.tensor_parallel.tp_group) - if moe: - new_module = transformer_inference.DeepSpeedMoEInference( - transformer_config, - mp_group=mp_group, - ep_group=None if ep_group is None else ep_group[num_experts], - expert_mp_group=None - if expert_mp_group is None else expert_mp_group[num_experts], - ) + # 3. Initialize tensors + _container.initialize_tensors() - else: - new_module = transformer_inference.DeepSpeedTransformerInference( - transformer_config, - mp_group=mp_group, - ) - new_module.config.scale_attention = scale_attention - - # we want the weights in [input, output] shape - # linear layer is created with [input, output] shape - # transpose it here to reduce inference cost! 
- def transpose(data): - data.view(-1).copy_(data.transpose(-1, -2).contiguous().view(-1)) - data = data.reshape(data.shape[-1], data.shape[-2]) - return data - - if attn_linear_layer: - qkvw.data = transpose(qkvw.data) - dense_w.data = transpose(dense_w.data) - - if megatron_v2: - new_module.config.rotate_half = True - new_module.config.rotate_every_two = False - - def _transpose(x): - num_attention_heads_per_partition = transformer_config.heads // transformer_config.mp_size - attention_head_size = x.shape[-1] // num_attention_heads_per_partition - new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, - attention_head_size) - x_1 = x.view(*new_x_shape) - (q, - k, - v) = torch.split(x_1, - (x_1.shape[-1] // 3), - dim=(x_1.dim() - 1)) - if len(q.shape) > 2: - return torch.cat((q.reshape(q.shape[0], - -1), - k.reshape(q.shape[0], - -1), - v.reshape(q.shape[0], - -1)), - dim=-1).reshape(x.shape) - else: - return torch.cat((q.reshape(-1), - k.reshape(-1), - v.reshape(-1)), - dim=-1).reshape(x.shape) - - qkvw = torch.nn.Parameter(_transpose(qkvw).contiguous()) - qkvb = torch.nn.Parameter(_transpose(qkvb).contiguous()) - - dense_b = dense_b * (transformer_config.training_mp_size / - transformer_config.mp_size) - _4hh_b = _4hh_b * (transformer_config.training_mp_size / - transformer_config.mp_size) - - if mlp_linear_layer: - _h4h_w = [transpose(moe_w1.data) - for moe_w1 in _h4h_w] if moe else transpose(_h4h_w.data) - _4hh_w = [transpose(moe_w1.data) - for moe_w1 in _4hh_w] if moe else transpose(_4hh_w.data) - - if moe and moe_type == 'residual': - _res_h4h_w.data = transpose(_res_h4h_w.data) - _res_4hh_w.data = transpose(_res_4hh_w.data) - _res_coef.data = transpose(_res_coef.data) - - attn_block = new_module.attention - attn_block.attn_qkvw = mp_replace.qkv_copy(attn_block.attn_qkvw, qkvw) - attn_block.attn_qkvb = mp_replace.qkv_copy(attn_block.attn_qkvb, qkvb) - - attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w) - attn_block.attn_ob = 
mp_replace.copy(attn_block.attn_ob, dense_b) - - mpl_block = new_module.mlp - if moe: - gpu_index = torch.distributed.get_rank() - gpu_index = 0 - for ep_index in range(local_ep_size): - mpl_block[ep_index].inter_w.data = _h4h_w[ - gpu_index * local_ep_size + ep_index].to( - torch.cuda.current_device()) - mpl_block[ep_index].inter_b.data = _h4h_b[ - gpu_index * local_ep_size + ep_index].to( - torch.cuda.current_device()) - mpl_block[ep_index].output_w.data = _4hh_w[ - gpu_index * local_ep_size + ep_index].to( - torch.cuda.current_device()) - mpl_block[ep_index].output_b.data = _4hh_b[ - gpu_index * local_ep_size + ep_index].to( - torch.cuda.current_device()) - new_module.attn_nw.data = attn_nw.to(torch.cuda.current_device()) - new_module.attn_nb.data = attn_nb.to(torch.cuda.current_device()) - if moe_type == 'residual': - new_module.res_mlp.inter_w.data = _res_h4h_w.to( - torch.cuda.current_device()) - new_module.res_mlp.inter_b.data = _res_h4h_b.to( - torch.cuda.current_device()) - new_module.res_mlp.output_w.data = _res_4hh_w.to( - torch.cuda.current_device()) - new_module.res_mlp.output_b.data = _res_4hh_b.to( - torch.cuda.current_device()) - new_module.res_coef.data = _res_coef.to(torch.cuda.current_device()) - else: - mpl_block.inter_w.data = mp_replace.copy(mpl_block.inter_w, _h4h_w) - mpl_block.inter_b.data = mp_replace.copy(mpl_block.inter_b, _h4h_b) - mpl_block.output_w.data = mp_replace.copy(mpl_block.output_w, _4hh_w) - mpl_block.output_b.data = mp_replace.copy(mpl_block.output_b, _4hh_b) - if attn_nw is None: - new_module.mlp.attn_nw = attn_nw - else: - new_module.mlp.attn_nw.data = attn_nw.to(torch.cuda.current_device()) - if attn_nb is None: - new_module.mlp.attn_nb = attn_nb - else: - new_module.mlp.attn_nb.data = attn_nb.to(torch.cuda.current_device()) - new_module.norm_w.data = input_nw.to(torch.cuda.current_device()) - new_module.norm_b.data = input_nb.to(torch.cuda.current_device()) - else: - transformer_config = 
deepspeed.DeepSpeedTransformerConfig( - batch_size=micro_batch_size, - hidden_size=config.hidden_size, - heads=config.num_attention_heads, - attn_dropout_ratio=config.attention_probs_dropout_prob, - hidden_dropout_ratio=config.hidden_dropout_prob, - num_hidden_layers=config.num_hidden_layers, - initializer_range=config.initializer_range, - layer_norm_eps=config.layer_norm_eps if hasattr( - config, - 'layer_norm_eps') else 1e-12, - seed=seed, - fp16=fp16, - pre_layer_norm=(False if policy_cls is HFBertLayerPolicy else preln), - return_tuple=return_tuple, - local_rank=local_rank, - stochastic_mode=stochastic_mode, - normalize_invertible=True, - training=training) - new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config) - new_module.attn_qkvw.data = qkvw - new_module.attn_qkvb.data = qkvb - new_module.attn_ow.data = dense_w - new_module.attn_ob.data = dense_b - - new_module.attn_nw.data = attn_nw - new_module.attn_nb.data = attn_nb - new_module.norm_w.data = input_nw - new_module.norm_b.data = input_nb - - new_module.inter_w.data = _h4h_w - new_module.inter_b.data = _h4h_b - new_module.output_w.data = _4hh_w - new_module.output_b.data = _4hh_b - return new_module + # 4. deal with data types -- needs refactor to use dtype instead of fp16 + if fp16: + _container.convert_to_required_dtype(dtype=torch.half) + + # 5. Set the quantization config + quantizer = GroupQuantizer(q_int8=quantize) + _container.set_quantization_config(quantize, quantizer) + + # 6. create a DS Inference config object + _container.create_ds_model_config() + + # 7. use the config and create the module + _container.create_module() + + # 8. transpose the weights and bias if needed + _container.transpose() + + # 9. deal with tensor parallelism. + _container.apply_tensor_parallelism(mp_replace) + + # 10. copy the tensors from the model-specific container to the new module + _container.copy_data_to_new_module() + + # 11. 
set global for generic checkpoint loading + global container_g + + if container_g is None: + container_g = _container + + return _container.module def replace_wo_policy(module, all_reduce_linears): + mp_size = config.tensor_parallel.tp_size + mp_group = config.tensor_parallel.tp_group + def _replace(child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) + weight_shape = child.weight.shape if name in all_reduce_linears: - new_weight = torch.empty( - (child.weight.shape[0] - if conv_linear_layer else child.weight.shape[1] // mp_size, - child.weight.shape[1] - if conv_linear_layer else child.weight.shape[0]), - device=child.weight.device, - dtype=torch.half if fp16 else torch.float) - if not conv_linear_layer: - child.weight.data.view(-1).copy_( - child.weight.data.transpose(-1, - -2).contiguous().view(-1)) - child.weight.data = child.weight.data.reshape( - child.weight.data.shape[-1], - child.weight.data.shape[-2]) - data = mp_replace.copy(new_weight, - child.weight.data).to(torch.cuda.current_device()) + new_weight = torch.empty(( + weight_shape[1] if conv_linear_layer else weight_shape[0], + (weight_shape[0] if conv_linear_layer else weight_shape[1]) // + mp_size, + ), + device=child.weight.device, + dtype=child.weight.dtype) + if conv_linear_layer: + child.weight.data = child.weight.data.transpose(-1, -2).contiguous() + data = mp_replace.copy(new_weight, child.weight.data) + new_bias = torch.empty((weight_shape[0]), + device=child.weight.device, + dtype=child.weight.dtype) + if child.bias is not None: + new_bias.data.copy_(child.bias.data) return LinearAllreduce(data, child.bias if child.bias is None else \ - child.bias.to(torch.cuda.current_device()), mp_group) + torch.nn.parameter.Parameter(new_bias.to(get_accelerator().current_device_name())), mp_group) else: - new_weight = torch.empty( - (child.weight.shape[0] // - mp_size if conv_linear_layer else child.weight.shape[1], - child.weight.shape[1] - if conv_linear_layer else 
child.weight.shape[0] // mp_size), - device=child.weight.device, - dtype=torch.half if fp16 else torch.float) - if not conv_linear_layer: - child.weight.data.view(-1).copy_( - child.weight.data.transpose(-1, - -2).contiguous().view(-1)) - child.weight.data = child.weight.data.reshape( - child.weight.data.shape[-1], - child.weight.data.shape[-2]) + new_weight = torch.empty(( + (weight_shape[1] if conv_linear_layer else weight_shape[0]) // + mp_size, + weight_shape[0] // mp_size if conv_linear_layer else weight_shape[1], + ), + device=child.weight.device, + dtype=child.weight.dtype) + if conv_linear_layer: + child.weight.data = child.weight.data.transpose(-1, -2).contiguous() data = mp_replace.copy(new_weight, child.weight.data) - new_bias = torch.empty((child.weight.shape[1] // mp_size), + + new_bias = torch.empty((weight_shape[0] // mp_size), device=child.weight.device, - dtype=torch.half if fp16 else torch.float) + dtype=child.weight.dtype) bias_data = None if child.bias is None else mp_replace.copy( new_bias, - child.bias.data).to(torch.cuda.current_device()) - return LinearLayer(data.to(torch.cuda.current_device()), bias_data) + child.bias.data).to(get_accelerator().current_device_name()) + return LinearLayer(weight=data.to( + get_accelerator().current_device_name()), + bias=bias_data) def _slice_embedding(child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) @@ -547,7 +454,9 @@ def replace_transformer_layer(orig_layer_impl, child.weight.shape[1] // mp_size), device=child.weight.device, dtype=child.weight.dtype) - data = mp_replace.copy(new_weight, child.weight.data) + data = mp_replace.copy(new_weight, + child.weight.ds_tensor.data if hasattr(child.weight, 'ds_tensor') else \ + child.weight.data) new_embedding = nn.Embedding(child.weight.shape[0], child.weight.shape[1] // mp_size) new_embedding.weight.data.copy_(data) @@ -562,10 +471,14 @@ def replace_transformer_layer(orig_layer_impl, child.num_heads = child.num_heads // 
mp_size if hasattr(child, 'num_attention_heads'): child.num_attention_heads = child.num_attention_heads // mp_size + if hasattr(child, 'num_attn_heads'): + child.num_attn_heads = child.num_attn_heads // mp_size if hasattr(child, 'all_head_size'): child.all_head_size = child.all_head_size // mp_size if hasattr(child, 'embed_dim'): child.embed_dim = child.embed_dim // mp_size + if hasattr(child, 'hidden_size'): + child.hidden_size = child.hidden_size // mp_size conv_linear_layer = False if linear_layer_setting is not None: @@ -600,32 +513,193 @@ def replace_transformer_layer(orig_layer_impl, return _replace_module(module) def replace_fn(child, _policy, layer_id=0): + training = False # todo: refactor this part to go in the config if training: # copy relevant state from child -> new module - new_module = replace_with_policy(child, - _policy, - triangular_masking, - preln=preln) + new_module = replace_with_policy(child, _policy, config.triangular_masking) else: # copy relevant state from child -> new module - if replace_with_kernel_inject: + if config.replace_with_kernel_inject: new_module = replace_with_policy(child, _policy, - triangular_masking, + config.triangular_masking, inference=True, - preln=(_policy - is not HFBertLayerPolicy), layer_id=layer_id) else: new_module = replace_wo_policy(child, _policy) return new_module - return replace_module(model=model, - orig_class=orig_layer_impl, - replace_fn=replace_fn, - _replace_policy=policy) + replaced_module = replace_module(model=model, + orig_class=orig_layer_impl, + replace_fn=replace_fn, + _replace_policy=config.injection_policy_tuple) + + quantizer = GroupQuantizer(q_int8=quantize) + world_size = dist.get_world_size() if dist.is_initialized() else 1 + rank = dist.get_rank() if dist.is_initialized() else 0 + if checkpoint_dict is not None: + assert container_g.ckpt_load_enabled, \ + f"Meta Tensor checkpoint loading not supported in {container_g.__class__.__name__} container" + start_time = time.time() + checkpoint 
= checkpoint_dict['checkpoints'] + ckpt_list = checkpoint["tp"] if type(checkpoint) is dict else checkpoint + ckpt_type = checkpoint_dict.get('parallelization', 'pp') + ckpt_mp_size = checkpoint_dict.get('tp_size', len(ckpt_list)) + ckpt_mp_size = checkpoint_dict.get('mp_size', ckpt_mp_size) + base_dir1 = checkpoint_dict.get('base_dir', config.base_dir) + + if ckpt_type == 'pp' and type(checkpoint) is list: + pbar = tqdm.tqdm(total=len(checkpoint), + desc=f"Loading {len(checkpoint)} checkpoint shards") + + for i in range(len(checkpoint)): + sd = [ + torch.load(os.path.join(base_dir1, + checkpoint[i]), + map_location='cpu') + ] + load_model_with_checkpoint(replaced_module, + sd, + mp_replace, + ckpt_type, + ckpt_mp_size, + quantizer, + container=container_g) + pbar.update(1) + else: + import gc + num_checkpoints = len(ckpt_list) // ckpt_mp_size + tp_split_size = (world_size / ckpt_mp_size) + sd_offset = int(rank / tp_split_size) + sd_count = int((rank + max(1, tp_split_size)) / tp_split_size) - sd_offset + pbar = tqdm.tqdm(total=num_checkpoints, + desc=f"Loading {num_checkpoints} checkpoint shards") + for i in range(num_checkpoints): + pbar.update(1) + ckpt_index = i * ckpt_mp_size + sd_offset + ckpt_files = [ + os.path.join(base_dir1, + ckpt_list[ckpt_index + + j]) if base_dir1 else ckpt_list[ckpt_index + + j] + for j in range(sd_count) + ] + sds = [ + torch.load(ckpt_file, + map_location='cpu') for ckpt_file in ckpt_files + ] + load_model_with_checkpoint(replaced_module, + sds, + mp_replace, + ckpt_type, + ckpt_mp_size, + quantizer, + int(rank % tp_split_size), + container=container_g) + sds = [None for _ in sds] + gc.collect() + + if "non_tp" in checkpoint: + pbar = tqdm.tqdm( + total=len(checkpoint["non_tp"]), + desc=f"Loading {len(checkpoint['non_tp'])} checkpoint shards") + + for i in range(len(checkpoint["non_tp"])): + pbar.update(1) + ckpt_file = os.path.join(base_dir1, + checkpoint["non_tp"][i] + ) if base_dir1 else checkpoint["non_tp"][i] + sds = 
[torch.load(ckpt_file, map_location='cpu')] + load_model_with_checkpoint(replaced_module, + sds, + mp_replace, + ckpt_type, + ckpt_mp_size, + quantizer, + int(rank % tp_split_size), + container=container_g) + sds = [None for _ in sds] + gc.collect() + print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec") + + if config.save_mp_checkpoint_path is not None: + from collections import OrderedDict + import json + num_partitions = 8 + + if checkpoint_dict is None: + ckpt_name = "ds_model" + try: + from transformers.models.bloom.modeling_bloom import BloomForCausalLM + if isinstance(model, BloomForCausalLM): + ckpt_name = "bloom" + except ImportError: + ckpt_name = "ds_model" + else: + ckpt_name = checkpoint_dict['type'] + if dist.is_initialized(): + dist.barrier() + transformer_name = get_transformer_name(replaced_module) + non_tp_ckpt_name = f'non-tp.pt' + ckpt_files = [non_tp_ckpt_name] + os.makedirs(config.save_mp_checkpoint_path, exist_ok=True) + + if not dist.is_initialized() or dist.get_rank() == 0: + print("Saving tp-sharded checkpoints") + torch.save( + OrderedDict({ + k: v + for k, + v in dict(replaced_module.state_dict()).items() + if transformer_name not in k + }), + f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}') + ckpt_config = json.dumps({ + 'type': + ckpt_name, + 'base_dir': + f'{config.save_mp_checkpoint_path}', + 'checkpoints': { + "non_tp": + ckpt_files, + "tp": [ + f'tp_{r:0>2d}_{m:0>2d}.pt' for m in range(num_partitions) + for r in range(world_size) + ] + }, + 'version': + 1.0, + 'parallelization': + 'tp', + 'tp_size': + world_size, + 'dtype': + 'int8' if quantize else ('float16' if fp16 else 'float32') + }) + with open(f"{config.save_mp_checkpoint_path}/ds_inference_config.json", + "w") as cfg: + cfg.write(ckpt_config) + + rep_sd = replaced_module.state_dict() + for n, p in replaced_module.named_parameters(): + if hasattr(p, 'scale'): + rep_sd[n] = [p, p.scale] + keys = list(rep_sd.keys()) + partition_size = (len(keys) 
// num_partitions + 1) + for m in range(num_partitions): + torch.save( + OrderedDict({ + k: [rep_sd[k], + rep_sd[k].scale] if hasattr(rep_sd[k], + 'scale') else rep_sd[k] + for k in keys[m * partition_size:(m + 1) * partition_size] + if transformer_name in k + }), + f'{config.save_mp_checkpoint_path}/tp_{rank:0>2d}_{m:0>2d}.pt') + + return replaced_module def revert_transformer_layer(orig_layer_impl, model, config, preln=False): @@ -751,4 +825,6 @@ def _replace_module(model, policies, layer_id=0): else: _, layer_id = _replace_module(child, policies, layer_id=layer_id) + # Add the reset_cache func to the model, so that it can be called in the beginning of text-generation. + model.reset_cache = transformer_inference.DeepSpeedTransformerInference.reset_cache return model, layer_id diff --git a/deepspeed/module_inject/replace_policy.py b/deepspeed/module_inject/replace_policy.py old mode 100644 new mode 100755 index c8d14e431d0863aa18afbd39f553470f1647d3a8..65dadcc182cf042e17e57043861291c634c376e4 --- a/deepspeed/module_inject/replace_policy.py +++ b/deepspeed/module_inject/replace_policy.py @@ -1,374 +1,20 @@ -from abc import ABC - -import torch -from torch.nn.parameter import Parameter - - -class DSPolicy(ABC): - def __init__(self, - inference=True, - linear_layer=True, - scale_attention=True, - megatron_v2=False): - self.inference = inference - self.linear_layer = linear_layer - self.scale_attention = scale_attention - self.is_megatron_v2 = megatron_v2 - - def attention(self): - """ - Returns attention qkv and dense parameters - weight: (3*hidden, hidden) and (hidden, hidden) - bias: (3*hidden) and (hidden) - """ - raise NotImplementedError - - def get_hidden_heads(self): - """ - return hidden_size and number of heads - """ - raise NotImplementedError - - def mlp(self): - """ - Returns mlp intermediate and output - weight: (intermediate, hidden) and (hidden, intermediate) - bias: (intermediate) and (hidden) - """ - raise NotImplementedError - - def layerNorm(self): 
- """ - Returns LayerNorms used in transformer layer - Post-Attention and pre/post layer norm - gamma and beta with shape: (hidden) - """ - raise NotImplementedError - - -class HFBertLayerPolicy(DSPolicy): - _orig_layer_class = None - - def __init__(self, client_module, inference=False, preln=False): - super().__init__(inference) - self.client_module = client_module - self.preln = preln - if HFBertLayerPolicy._orig_layer_class is None: - try: - import transformers - HFBertLayerPolicy._orig_layer_class = [ - transformers.models.bert.modeling_bert.BertLayer, - transformers.models.roberta.modeling_roberta.RobertaLayer - ] - except: - HFBertLayerPolicy._orig_layer_class = None - - def get_hidden_heads(self): - return self.client_module.attention.self.query.weight.shape[1], \ - self.client_module.attention.self.num_attention_heads - - def attention(self): - qw = self.client_module.attention.self.query.weight - qb = self.client_module.attention.self.query.bias - kw = self.client_module.attention.self.key.weight - kb = self.client_module.attention.self.key.bias - vw = self.client_module.attention.self.value.weight - vb = self.client_module.attention.self.value.bias - - qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False) - qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False) - - return self.linear_layer, \ - qkvw, \ - qkvb, \ - self.client_module.attention.output.dense.weight, \ - self.client_module.attention.output.dense.bias, \ - self.scale_attention, \ - self.is_megatron_v2 - - def mlp(self): - if self.preln: - intermediate_ff = self.client_module.intermediate.dense_act - else: - intermediate_ff = self.client_module.intermediate.dense - - return self.linear_layer, intermediate_ff.weight, intermediate_ff.bias, \ - self.client_module.output.dense.weight, \ - self.client_module.output.dense.bias - - def layerNorm(self): - if self.preln: - attention_layernorm = self.client_module.PostAttentionLayerNorm - transformer_layernorm = 
self.client_module.PreAttentionLayerNorm - else: - attention_layernorm = self.client_module.attention.output.LayerNorm - transformer_layernorm = self.client_module.output.LayerNorm - return attention_layernorm.weight, \ - attention_layernorm.bias, \ - transformer_layernorm.weight, \ - transformer_layernorm.bias - - -class HFGPTNEOLayerPolicy(DSPolicy): - _orig_layer_class = None - - def __init__(self, client_module, inference=True): - super().__init__(inference, scale_attention=False) - self.client_module = client_module - try: - import transformers - HFGPTNEOLayerPolicy._orig_layer_class = transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoBlock - except: - HFGPTNEOLayerPolicy._orig_layer_class = None - - def get_hidden_heads(self): - return self.client_module.attn.attention.q_proj.weight.shape[1], \ - self.client_module.attn.attention.num_heads - - def attention(self): - qw = self.client_module.attn.attention.q_proj.weight - kw = self.client_module.attn.attention.k_proj.weight - vw = self.client_module.attn.attention.v_proj.weight - - qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False) - - return self.linear_layer, \ - qkvw, \ - None, \ - self.client_module.attn.attention.out_proj.weight, \ - self.client_module.attn.attention.out_proj.bias, \ - self.scale_attention, \ - self.is_megatron_v2 - - def mlp(self): - return self.linear_layer, \ - self.client_module.mlp.c_fc.weight, \ - self.client_module.mlp.c_fc.bias, \ - self.client_module.mlp.c_proj.weight, \ - self.client_module.mlp.c_proj.bias - - def layerNorm(self): - return self.client_module.ln_2.weight, \ - self.client_module.ln_2.bias, \ - self.client_module.ln_1.weight, \ - self.client_module.ln_1.bias - - -class HFGPTJLayerPolicy(DSPolicy): - _orig_layer_class = None - - def __init__(self, client_module, inference=True): - super().__init__(inference, scale_attention=True) - self.client_module = client_module - try: - import transformers - HFGPTJLayerPolicy._orig_layer_class = 
transformers.models.gptj.modeling_gptj.GPTJBlock - except: - HFGPTJLayerPolicy._orig_layer_class = None - - def get_hidden_heads(self): - return self.client_module.attn.q_proj.weight.shape[1], \ - self.client_module.attn.num_attention_heads - - def attention(self): - qw = self.client_module.attn.q_proj.weight - kw = self.client_module.attn.k_proj.weight - vw = self.client_module.attn.v_proj.weight - - qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False) - - return self.linear_layer, \ - qkvw, \ - None, \ - self.client_module.attn.out_proj.weight, \ - None, \ - self.scale_attention, \ - self.is_megatron_v2 - - def mlp(self): - return self.linear_layer, \ - self.client_module.mlp.fc_in.weight, \ - self.client_module.mlp.fc_in.bias, \ - self.client_module.mlp.fc_out.weight, \ - self.client_module.mlp.fc_out.bias - - def layerNorm(self): - return None, \ - None, \ - self.client_module.ln_1.weight, \ - self.client_module.ln_1.bias - - -class MegatronLayerPolicy(DSPolicy): - _orig_layer_class = None - version = 0 - moe_type = 'standard' - - def __init__(self, client_module, inference=True): - super().__init__(inference) - self.client_module = client_module - # we use megatron version to differentiate between the old and new - # megatron-lm source code - if MegatronLayerPolicy._orig_layer_class is None: - try: - import megatron - from megatron.model.transformer import ParallelTransformerLayer - MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer - except ImportError: - MegatronLayerPolicy._orig_layer_class = None - - def get_hidden_heads(self): - return self.client_module.attention.query_key_value.weight.shape[1], \ - self.client_module.attention.num_attention_heads - - def attention(self): - if self.inference: - if MegatronLayerPolicy.version == 0: - attention = self.client_module.attention - else: - attention = self.client_module.self_attention - - return self.linear_layer, \ - attention.query_key_value.weight, \ - 
attention.query_key_value.bias, \ - attention.dense.weight, \ - attention.dense.bias, \ - self.scale_attention, \ - self.is_megatron_v2 - - def mlp(self, moe_type='standard'): - from deepspeed.moe.utils import has_moe_layers - moe, _ = has_moe_layers(self.client_module) - - if moe: - moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \ - self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts - num_experts = len(moe_experts) - if moe_type == 'standard': - return self.linear_layer, \ - [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \ - [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \ - [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \ - [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)] - else: - - return self.linear_layer, \ - [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \ - [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \ - [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \ - [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \ - self.client_module.mlp.mlp.dense_h_to_4h.weight, \ - self.client_module.mlp.mlp.dense_h_to_4h.bias, \ - self.client_module.mlp.mlp.dense_4h_to_h.weight, \ - self.client_module.mlp.mlp.dense_4h_to_h.bias, \ - self.client_module.mlp.coefficient.weight - - else: - return self.linear_layer, \ - self.client_module.mlp.dense_h_to_4h.weight, \ - self.client_module.mlp.dense_h_to_4h.bias, \ - self.client_module.mlp.dense_4h_to_h.weight, \ - self.client_module.mlp.dense_4h_to_h.bias - - def layerNorm(self): - return self.client_module.post_attention_layernorm.weight, \ - self.client_module.post_attention_layernorm.bias, \ - self.client_module.input_layernorm.weight, \ - self.client_module.input_layernorm.bias - - -class HFGPT2LayerPolicy(DSPolicy): - _orig_layer_class = None - - def __init__(self, client_module, inference=True): - # 
HuggingFace GPT2 uses convolutional layer instead of linear layer - super().__init__(inference, linear_layer=False) - self.client_module = client_module - try: - import transformers - HFGPT2LayerPolicy._orig_layer_class = transformers.models.gpt2.modeling_gpt2.GPT2Block - except: - HFGPT2LayerPolicy._orig_layer_class = None - - def get_hidden_heads(self): - return self.client_module.attn.embed_dim, \ - self.client_module.attn.num_heads - - def attention(self): - return self.linear_layer, \ - self.client_module.attn.c_attn.weight, \ - self.client_module.attn.c_attn.bias, \ - self.client_module.attn.c_proj.weight, \ - self.client_module.attn.c_proj.bias, \ - self.scale_attention, \ - self.is_megatron_v2 - - def mlp(self): - return self.linear_layer, \ - self.client_module.mlp.c_fc.weight, \ - self.client_module.mlp.c_fc.bias, \ - self.client_module.mlp.c_proj.weight, \ - self.client_module.mlp.c_proj.bias - - def layerNorm(self): - return self.client_module.ln_2.weight, \ - self.client_module.ln_2.bias, \ - self.client_module.ln_1.weight, \ - self.client_module.ln_1.bias - - -class GPTNEOXLayerPolicy(DSPolicy): - _orig_layer_class = None - version = 0 - - def __init__(self, client_module, inference=True, megatron_v2=True): - super().__init__(inference, megatron_v2=megatron_v2) - self.client_module = client_module - if GPTNEOXLayerPolicy._orig_layer_class is None: - try: - import megatron - from megatron.model.transformer import ParallelTransformerLayerPipe - GPTNEOXLayerPolicy._orig_layer_class = ParallelTransformerLayerPipe - except ImportError: - GPTNEOXLayerPolicy._orig_layer_class = None - - def get_hidden_heads(self): - if GPTNEOXLayerPolicy.version == 0: - attention = self.client_module.attention - else: - attention = self.client_module.self_attention - - return self.client_module.attention.query_key_value.weight.shape[1], \ - self.client_module.attention.num_attention_heads - - def attention(self): - if GPTNEOXLayerPolicy.version == 0: - attention = 
self.client_module.attention - else: - attention = self.client_module.self_attention - - return self.linear_layer, \ - attention.query_key_value.weight, \ - attention.query_key_value.bias, \ - attention.dense.weight, \ - attention.dense.bias, \ - self.scale_attention, \ - self.is_megatron_v2 - - def mlp(self): - return self.linear_layer, \ - self.client_module.mlp.dense_h_to_4h.weight, \ - self.client_module.mlp.dense_h_to_4h.bias, \ - self.client_module.mlp.dense_4h_to_h.weight, \ - self.client_module.mlp.dense_4h_to_h.bias - - def layerNorm(self): - return self.client_module.post_attention_layernorm.weight, \ - self.client_module.post_attention_layernorm.bias, \ - self.client_module.input_layernorm.weight, \ - self.client_module.input_layernorm.bias - - +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' +from .containers import HFGPT2LayerPolicy +from .containers import HFBertLayerPolicy +from .containers import BLOOMLayerPolicy +from .containers import HFGPTJLayerPolicy +from .containers import HFGPTNEOLayerPolicy +from .containers import GPTNEOXLayerPolicy +from .containers import HFOPTLayerPolicy +from .containers import MegatronLayerPolicy +from .containers import HFDistilBertLayerPolicy +from .containers import HFCLIPLayerPolicy +from .containers import UNetPolicy +from .containers import VAEPolicy + +# transformer-based policies replace_policies = [ HFBertLayerPolicy, HFGPTNEOLayerPolicy, @@ -376,4 +22,11 @@ replace_policies = [ HFGPTJLayerPolicy, MegatronLayerPolicy, HFGPT2LayerPolicy, + BLOOMLayerPolicy, + HFOPTLayerPolicy, + HFCLIPLayerPolicy, + HFDistilBertLayerPolicy ] + +# non-transformer-based policies +generic_policies = [UNetPolicy, VAEPolicy] diff --git a/deepspeed/module_inject/utils.py b/deepspeed/module_inject/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7ebd797ec31cbc43c0e26127b0b74faa8798aa25 --- /dev/null +++ b/deepspeed/module_inject/utils.py @@ -0,0 +1,40 @@ +'''Copyright The Microsoft DeepSpeed Team''' 
+ +from deepspeed.utils import log_dist + + +# helper function to map between DS policies and DS containers +def policy_to_ds_container(**kwargs): + from .containers import HFGPT2LayerPolicy, DS_GPT2Container + from .containers import HFBertLayerPolicy, DS_BERTContainer + from .containers import BLOOMLayerPolicy, DS_BloomContainer + from .containers import HFGPTJLayerPolicy, DS_GPTJContainer + from .containers import HFGPTNEOLayerPolicy, DS_GPTNEOContainer + from .containers import GPTNEOXLayerPolicy, DS_GPTNEOXContainer + from .containers import HFOPTLayerPolicy, DS_OPTContainer + from .containers import MegatronLayerPolicy, DS_MegatronGPTContainer + from .containers import HFDistilBertLayerPolicy, DS_DistilBERTContainer + + policy_to_container = { + HFGPT2LayerPolicy: DS_GPT2Container, + HFBertLayerPolicy: DS_BERTContainer, + BLOOMLayerPolicy: DS_BloomContainer, + HFGPTJLayerPolicy: DS_GPTJContainer, + HFGPTNEOLayerPolicy: DS_GPTNEOContainer, + GPTNEOXLayerPolicy: DS_GPTNEOXContainer, + HFOPTLayerPolicy: DS_OPTContainer, + MegatronLayerPolicy: DS_MegatronGPTContainer, + HFDistilBertLayerPolicy: DS_DistilBERTContainer, + } + + container = None + policy = kwargs['policy'] + assert policy is not None, "Policy cannot be None" + policy_type = type(policy) + + if policy_type not in policy_to_container: + log_dist(f"Policy type {policy_type} not supported", [0]) + else: + container = policy_to_container[policy_type](**kwargs) + + return container diff --git a/deepspeed/moe/__init__.py b/deepspeed/moe/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 100644 --- a/deepspeed/moe/__init__.py +++ b/deepspeed/moe/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/moe/layer.py b/deepspeed/moe/layer.py index c596da4903e8bfb0dfd81bd739b20508f7077f11..6b4a07642cc8692113ab778d9f83d3d2f42b909f 100644 --- a/deepspeed/moe/layer.py +++ b/deepspeed/moe/layer.py @@ -2,20 +2,35 @@ Copyright 
2020 The Microsoft DeepSpeed Team ''' -import torch.nn.init as init import torch -import torch.distributed as dist -from deepspeed.utils import logger, log_dist +from deepspeed.utils import log_dist -import deepspeed.utils.groups as groups +from deepspeed.utils import groups from .sharded_moe import MOELayer, TopKGate from .experts import Experts -import copy import typing class MoE(torch.nn.Module): + """Initialize an MoE layer. + + Arguments: + hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension. + expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear). + num_experts (int, optional): default=1, the total number of experts per layer. + ep_size (int, optional): default=1, number of ranks in the expert parallel world or group. + k (int, optional): default=1, top-k gating value, only supports k=1 or k=2. + capacity_factor (float, optional): default=1.0, the capacity of the expert at training time. + eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time. + min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor. + use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer. + noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'. + drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity). + use_rts (bool, optional): default=True, whether to use Random Token Selection. + use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed). 
+ enable_expert_tensor_parallelism (bool, optional): default=False, whether to use tensor parallelism for experts + """ def __init__(self, hidden_size, expert, @@ -29,37 +44,21 @@ class MoE(torch.nn.Module): noisy_gate_policy: typing.Optional[str] = None, drop_tokens: bool = True, use_rts=True, - use_tutel: bool = False): - """Initialize an MoE layer. - - Arguments: - hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension. - expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear). - num_experts (int, optional): default=1, the total number of experts per layer. - ep_size (int, optional): default=1, number of ranks in the expert parallel world or group. - k (int, optional): default=1, top-k gating value, only supports k=1 or k=2. - capacity_factor (float, optional): default=1.0, the capacity of the expert at training time. - eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time. - min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor. - use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer. - noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'. - drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity). - use_rts (bool, optional): default=True, whether to use Random Token Selection. - use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed). 
- """ + use_tutel: bool = False, + enable_expert_tensor_parallelism: bool = False): super(MoE, self).__init__() self.use_residual = use_residual - self.ep_size = min( - ep_size, - num_experts) # the ep size should be less than the number of experts + self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism + assert num_experts % ep_size == 0, f"Number of experts ({num_experts}) should be divisible by expert parallel size ({ep_size})" + self.ep_size = ep_size self.expert_group_name = f"ep_size_{self.ep_size}" self.num_experts = num_experts - self.num_local_experts = 1 if num_experts < ep_size else num_experts // ep_size + self.num_local_experts = num_experts // self.ep_size log_dist( - f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {ep_size}', + f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {self.ep_size}', [0]) assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \ @@ -94,9 +93,12 @@ class MoE(torch.nn.Module): print( f"No existing process group found, creating a new group named: {self.expert_group_name}" ) - if groups.mpu is None: + if (groups.mpu is None) or (not self.enable_expert_tensor_parallelism): + # Condition 1 - no groups.mpu means no tensor parallelism + # Condition 2 - disabling expert tensor parallelism on purpose groups._create_expert_and_data_parallel(self.ep_size) else: + # expert tensor parallelism is enabled groups._create_expert_data_and_model_parallel(self.ep_size, mpu=groups.mpu) # Set the group handle for the MOELayer (deepspeed_moe) object @@ -126,6 +128,6 @@ class MoE(torch.nn.Module): if type(output_mlp) is tuple: output_mlp = output_mlp[0] # Ignore the bias term for now coef = self.coefficient(hidden_states) - coef = torch.nn.functional.softmax(coef, dim=1) + coef = torch.nn.functional.softmax(coef, dim=-1) output = output * coef[..., 0:1] 
+ output_mlp * coef[..., 1:] return output, self.deepspeed_moe.l_aux, self.deepspeed_moe.exp_counts diff --git a/deepspeed/moe/mappings.py b/deepspeed/moe/mappings.py new file mode 100644 index 0000000000000000000000000000000000000000..38f1630a67034990f8bb4e8cc75670e7f6a5f552 --- /dev/null +++ b/deepspeed/moe/mappings.py @@ -0,0 +1,108 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +# The file has been adapted from the following Megatron-LM file: +# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/mpu/mappings.py +# Git commit hash: 9dc3c42a84aa656f583703cf8b6b4f79f712b796 +# We retain the following copyright from the original files: + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import deepspeed + + +def _gather_tokens(input_, dim=0): + """Gather tensors and concatenate them along a dimension""" + mpu = deepspeed.utils.groups.mpu + + input_ = input_.contiguous() + # Size and dimension. + rank = mpu.get_tensor_model_parallel_rank() + + tensor_list = [ + torch.empty_like(input_) + for _ in range(mpu.get_tensor_model_parallel_world_size()) + ] + tensor_list[rank] = input_ + deepspeed.comm.all_gather(tensor_list, + input_, + group=mpu.get_tensor_model_parallel_group()) + + # Note: torch.cat already creates a contiguous tensor. 
+ output = torch.cat(tensor_list, dim=dim).contiguous() + + return output + + +def _drop_tokens(input_, dim=0): + """Divide a tensor among the tensor parallel ranks""" + mpu = deepspeed.utils.groups.mpu + + total_chunks = mpu.get_tensor_model_parallel_world_size() + this_chunk = mpu.get_tensor_model_parallel_rank() + assert input_.shape[dim] % total_chunks == 0, f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})" + chunk_size = input_.shape[dim] // total_chunks + + return torch.narrow(input_, dim, this_chunk * chunk_size, chunk_size) + + +class _GatherTokens(torch.autograd.Function): + """All gather tokens among the tensor parallel ranks""" + @staticmethod + def symbolic(graph, input_, dim): + return _gather_tokens(input_, dim) + + @staticmethod + def forward(ctx, input_, dim): + ctx.dim = dim + return _gather_tokens(input_, dim) + + @staticmethod + def backward(ctx, grad_output): + return _drop_tokens(grad_output, ctx.dim), None + + +class _DropTokens(torch.autograd.Function): + "Divide tokens equally among the tensor parallel ranks" + + @staticmethod + def symbolic(graph, input_, dim): + return _drop_tokens(input_, dim) + + @staticmethod + def forward(ctx, input_, dim): + ctx.dim = dim + return _drop_tokens(input_, dim) + + @staticmethod + def backward(ctx, input_): + return _gather_tokens(input_, ctx.dim), None + + +def gather_tokens(input_, dim=0): + mpu = deepspeed.utils.groups.mpu + if mpu is None or mpu.get_tensor_model_parallel_world_size() == 1: + # no tensor parallelism for non-experts + return input_ + return _GatherTokens.apply(input_, dim) + + +def drop_tokens(input_, dim=0): + mpu = deepspeed.utils.groups.mpu + if mpu is None or mpu.get_tensor_model_parallel_world_size() == 1: + # no tensor parallelism for non-experts + return input_ + return _DropTokens.apply(input_, dim) diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index 
024de2f51e05c2e12356c3c67b1af820e80acc7d..211b2127b026db48cc63092f9bdaf72e50681e20 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -12,17 +12,16 @@ Copyright 2021 The Microsoft DeepSpeed Team # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. -from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer -from deepspeed.utils import logger, log_dist -from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple, Union, cast +from deepspeed.utils.timer import SynchronizedWallClockTimer +from deepspeed.utils import logger +from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple -import time -from time import perf_counter import torch from torch import Tensor -import torch.distributed as dist -from torch.nn import Module, ModuleList +from torch.nn import Module import torch.nn.functional as F +from deepspeed.utils import groups +from .mappings import drop_tokens, gather_tokens if TYPE_CHECKING: Base = Module[Tensor] @@ -80,12 +79,20 @@ def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor: return gumbel(shape) +from deepspeed import comm as dist + +# einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity +# See https://arxiv.org/pdf/2006.16668.pdf for details. 
+ + # Based on https://github.com/pytorch/pytorch/pull/40762 class _AllToAll(torch.autograd.Function): @staticmethod - def forward(ctx: Any, - group: dist.ProcessGroup, - input: Tensor) -> Tensor: # type: ignore + def forward( + ctx: Any, + # TODO: replace with DS process group + group: torch.distributed.ProcessGroup, + input: Tensor) -> Tensor: # type: ignore ctx.group = group input = input.contiguous() output = torch.empty_like(input) @@ -206,7 +213,7 @@ def top1gating(logits: Tensor, # if we don't want to drop any tokens if not drop_tokens: new_capacity = torch.max(exp_counts).to(logits.device) - dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.group.WORLD) + dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group()) capacity = new_capacity # Compute l_aux @@ -424,7 +431,7 @@ class TopKGate(Module): if self.wall_clock_breakdown: self.timers('TopKGate').stop() - self.gate_time = self.timers('TopKGate').elapsed(reset=False) * 1000 + self.gate_time = self.timers('TopKGate').elapsed(reset=False) return gate_output @@ -466,13 +473,17 @@ class MOELayer(Base): self.timers = SynchronizedWallClockTimer() self.wall_clock_breakdown = False - self.use_tutel = use_tutel and TUTEL_INSTALLED + self.use_tutel = use_tutel and TUTEL_INSTALLED and gate.k == 1 if self.use_tutel: logger.info('Using Tutel optimizations.') elif use_tutel and not TUTEL_INSTALLED: logger.warning("Tutel optimization requested but not installed. " "Proceeding without Tutel.") + elif use_tutel and TUTEL_INSTALLED and gate.k != 1: + logger.warning( + "To enable Tutel optimization, use top-1 instead of top-2 gate. 
" + "Proceeding without Tutel.") def _set_ep_group(self, ep_group): self.ep_group = ep_group @@ -511,11 +522,20 @@ class MOELayer(Base): if self.wall_clock_breakdown: self.timers('falltoall').start() + if groups._get_expert_model_parallel_world_size() == 1: + # If the non-expert is tensor-parallel, it will create + # duplicate tokens on the tensor-parallel ranks. + # Since our experts are not tensor-parallel, these duplicates + # need to be dropped to ensure correctness. + # this also doubles up as a communication optimization as we are + # reducing the all-to-all communication volume. + dispatched_input = drop_tokens(dispatched_input, dim=1) + dispatched_input = _AllToAll.apply(self.ep_group, dispatched_input) if self.wall_clock_breakdown: self.timers('falltoall').stop() - self.time_falltoall = self.timers('falltoall').elapsed(reset=False) * 1000 + self.time_falltoall = self.timers('falltoall').elapsed(reset=False) # Re-shape after all-to-all: ecm -> gecm dispatched_input = dispatched_input.reshape(self.ep_size, @@ -532,13 +552,19 @@ class MOELayer(Base): if self.wall_clock_breakdown: self.timers('salltoall').stop() - self.time_salltoall = self.timers('salltoall').elapsed(reset=False) * 1000 + self.time_salltoall = self.timers('salltoall').elapsed(reset=False) # Re-shape back: gecm -> ecm expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, -1, d_model) + if groups._get_expert_model_parallel_world_size() == 1: + # the dropped duplicate tokens need to be gathered on each + # tensor parallel rank again for the tensor-parallel + # non-expert of the next layer. 
+ expert_output = gather_tokens(expert_output, dim=1) + if self.use_tutel: combined_output = self._tutel_dispatcher.decode(expert_output.view(E * C, M)) else: @@ -550,6 +576,6 @@ class MOELayer(Base): if self.wall_clock_breakdown: self.timers('moe').stop() - self.time_moe = self.timers('moe').elapsed(reset=False) * 1000 + self.time_moe = self.timers('moe').elapsed(reset=False) return a diff --git a/deepspeed/moe/utils.py b/deepspeed/moe/utils.py index 09922937e6d825abc99a669b638a513c2c464042..1bf52795195b50dfab0e69e9c79f737802c1456c 100644 --- a/deepspeed/moe/utils.py +++ b/deepspeed/moe/utils.py @@ -1,12 +1,14 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from typing import List, Tuple, Dict import torch -import deepspeed.utils.groups as groups from .layer import MoE def has_moe_layers(m): has_moe = False num_experts = 0 + for _, module in m.named_modules(): if isinstance(module, MoE): has_moe = True @@ -59,8 +61,9 @@ def split_params_grads_into_shared_and_expert_params( return shared_grads, expert_grads -def split_params_into_different_moe_groups_for_optimizer( - param_groups: Tuple[Dict]) -> Tuple[Dict]: +def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dict], + max_group_size=178956971 + ) -> Tuple[Dict]: """Split parameters into different MoE groups for optimizer Args: @@ -112,8 +115,32 @@ def split_params_into_different_moe_groups_for_optimizer( param_group['params'] = new_params # Flatten the moe groups - for k, v in group_moe.items(): - for k1, v1 in v.items(): - param_groups.append(v1) + if max_group_size is not None: + for k, v in group_moe.items(): + for k1, v1 in v.items(): + cur_group = [] + all_groups = [] + size_of_cur_group = 0 + for param in v1['params']: + if size_of_cur_group + param.numel() <= max_group_size: + cur_group.append(param) + size_of_cur_group += param.numel() + else: + all_groups.append(cur_group) + cur_group = [param] + size_of_cur_group = param.numel() + if cur_group: + all_groups.append(cur_group) 
+ for group in all_groups: + new_dict = {} + for key, val in v1.items(): + if key != 'params': + new_dict[key] = val + new_dict['params'] = group + param_groups.append(new_dict) + else: + for k, v in group_moe.items(): + for k1, v1 in v.items(): + param_groups.append(v1) return tuple(param_groups) diff --git a/deepspeed/monitor/__init__.py b/deepspeed/monitor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 --- /dev/null +++ b/deepspeed/monitor/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/monitor/config.py b/deepspeed/monitor/config.py new file mode 100644 index 0000000000000000000000000000000000000000..09ba7ef1af47ac096eac02a9a414032547023e95 --- /dev/null +++ b/deepspeed/monitor/config.py @@ -0,0 +1,87 @@ +'''Copyright The Microsoft DeepSpeed Team''' +""" +Copyright (c) Microsoft Corporation +Licensed under the MIT license. +""" + +from pydantic import root_validator +from deepspeed.runtime.config_utils import DeepSpeedConfigModel + + +def get_monitor_config(param_dict): + monitor_dict = { + key: param_dict.get(key, + {}) + for key in ("tensorboard", + "wandb", + "csv_monitor") + } + return DeepSpeedMonitorConfig(**monitor_dict) + + +class TensorBoardConfig(DeepSpeedConfigModel): + """Sets parameters for TensorBoard monitor.""" + + enabled: bool = False + """ Whether logging to Tensorboard is enabled. Requires `tensorboard` package is installed. """ + + output_path: str = "" + """ + Path to where the Tensorboard logs will be written. If not provided, the + output path is set under the training script’s launching path. + """ + + job_name: str = "DeepSpeedJobName" + """ Name for the current job. This will become a new directory inside `output_path`. """ + + +class WandbConfig(DeepSpeedConfigModel): + """Sets parameters for WandB monitor.""" + + enabled: bool = False + """ Whether logging to WandB is enabled. 
Requires `wandb` package is installed. """ + + group: str = None + """ Name for the WandB group. This can be used to group together runs. """ + + team: str = None + """ Name for the WandB team. """ + + project: str = "deepspeed" + """ Name for the WandB project. """ + + +class CSVConfig(DeepSpeedConfigModel): + """Sets parameters for CSV monitor.""" + + enabled: bool = False + """ Whether logging to local CSV files is enabled. """ + + output_path: str = "" + """ + Path to where the csv files will be written. If not provided, the output + path is set under the training script’s launching path. + """ + + job_name: str = "DeepSpeedJobName" + """ Name for the current job. This will become a new directory inside `output_path`. """ + + +class DeepSpeedMonitorConfig(DeepSpeedConfigModel): + """Sets parameters for various monitoring methods.""" + + tensorboard: TensorBoardConfig = {} + """ TensorBoard monitor, requires `tensorboard` package is installed. """ + + wandb: WandbConfig = {} + """ WandB monitor, requires `wandb` package is installed. """ + + csv_monitor: CSVConfig = {} + """ Local CSV output of monitoring data. 
""" + @root_validator + def check_enabled(cls, values): + values["enabled"] = False + if (values.get("tensorboard").enabled or values.get("wandb").enabled + or values.get("csv_monitor").enabled): + values["enabled"] = True + return values diff --git a/deepspeed/monitor/csv_monitor.py b/deepspeed/monitor/csv_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..7de4fbcede58392200a4249f6ec03058bc0ec23d --- /dev/null +++ b/deepspeed/monitor/csv_monitor.py @@ -0,0 +1,63 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .monitor import Monitor +import os + +import deepspeed.comm as dist + + +class csvMonitor(Monitor): + def __init__(self, csv_config): + super().__init__(csv_config) + self.filenames = [] + self.enabled = csv_config.enabled + self.output_path = csv_config.output_path + self.job_name = csv_config.job_name + self.log_dir = self.setup_log_dir() + + def setup_log_dir(self, base=os.path.join(os.path.expanduser("~"), "csv_monitor")): + if self.enabled and dist.get_rank() == 0: + if self.output_path is not None: + log_dir = os.path.join(self.output_path, self.job_name) + # NOTE: This code path currently is never used since the default tensorboard_output_path is an empty string and not None. Saving it in case we want this functionality in the future. 
+ else: + if "DLWS_JOB_ID" in os.environ: + infra_job_id = os.environ["DLWS_JOB_ID"] + elif "DLTS_JOB_ID" in os.environ: + infra_job_id = os.environ["DLTS_JOB_ID"] + else: + infra_job_id = "unknown-job-id" + + csv_monitor_dir_name = os.path.join(infra_job_id, "logs") + log_dir = os.path.join(base, csv_monitor_dir_name, self.job_name) + os.makedirs(log_dir, exist_ok=True) + return log_dir + + def write_events(self, event_list): + if self.enabled and dist.get_rank() == 0: + import csv + # We assume each event_list element is a tensorboard-style tuple in the format: (log_name: String, value, step: Int) + for event in event_list: + log_name = event[0] + value = event[1] + step = event[2] + + # Set the header to the log_name + # Need this check because the deepspeed engine currently formats log strings to separate with '/' + if '/' in log_name: + record_splits = log_name.split('/') + header = record_splits[len(record_splits) - 1] + else: + header = log_name + + # sanitize common naming conventions into filename + filename = log_name.replace('/', '_').replace(' ', '_') + fname = self.log_dir + '/' + filename + '.csv' + + # Open file and record event. 
Insert header if this is the first time writing + with open(fname, 'a+') as csv_monitor_file: + csv_monitor_writer = csv.writer(csv_monitor_file) + if filename not in self.filenames: + self.filenames.append(filename) + csv_monitor_writer.writerow(['step', header]) + csv_monitor_writer.writerow([step, value]) diff --git a/deepspeed/monitor/monitor.py b/deepspeed/monitor/monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..504c3da391f5ce9ce6334729956592973cf554ec --- /dev/null +++ b/deepspeed/monitor/monitor.py @@ -0,0 +1,48 @@ +'''Copyright The Microsoft DeepSpeed Team''' +""" + Support different forms of monitoring such as wandb and tensorboard +""" + +from abc import ABC, abstractmethod +import deepspeed.comm as dist + + +class Monitor(ABC): + @abstractmethod + def __init__(self, monitor_config): + self.monitor_config = monitor_config + + @abstractmethod + def write_events(self, event_list): + pass + + +from .wandb import WandbMonitor +from .tensorboard import TensorBoardMonitor +from .csv_monitor import csvMonitor + + +class MonitorMaster(Monitor): + def __init__(self, monitor_config): + super().__init__(monitor_config) + self.tb_monitor = None + self.wandb_monitor = None + self.csv_monitor = None + self.enabled = monitor_config.enabled + + if dist.get_rank() == 0: + if monitor_config.tensorboard.enabled: + self.tb_monitor = TensorBoardMonitor(monitor_config.tensorboard) + if monitor_config.wandb.enabled: + self.wandb_monitor = WandbMonitor(monitor_config.wandb) + if monitor_config.csv_monitor.enabled: + self.csv_monitor = csvMonitor(monitor_config.csv_monitor) + + def write_events(self, event_list): + if dist.get_rank() == 0: + if self.tb_monitor is not None: + self.tb_monitor.write_events(event_list) + if self.wandb_monitor is not None: + self.wandb_monitor.write_events(event_list) + if self.csv_monitor is not None: + self.csv_monitor.write_events(event_list) diff --git a/deepspeed/monitor/tensorboard.py 
b/deepspeed/monitor/tensorboard.py new file mode 100644 index 0000000000000000000000000000000000000000..db3a50d75e236ef21db55eded602109e77eeaceb --- /dev/null +++ b/deepspeed/monitor/tensorboard.py @@ -0,0 +1,54 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .utils import check_tb_availability +from .monitor import Monitor +import os + +import deepspeed.comm as dist + + +class TensorBoardMonitor(Monitor): + def __init__(self, tensorboard_config): + super().__init__(tensorboard_config) + check_tb_availability() + + self.summary_writer = None + self.enabled = tensorboard_config.enabled + self.output_path = tensorboard_config.output_path + self.job_name = tensorboard_config.job_name + + if self.enabled and dist.get_rank() == 0: + self.get_summary_writer() + + def get_summary_writer(self, + base=os.path.join(os.path.expanduser("~"), + "tensorboard")): + if self.enabled and dist.get_rank() == 0: + from torch.utils.tensorboard import SummaryWriter + if self.output_path is not None: + log_dir = os.path.join(self.output_path, self.job_name) + # NOTE: This code path currently is never used since the default output_path is an empty string and not None. Saving it in case we want this functionality in the future. 
+ else: + if "DLWS_JOB_ID" in os.environ: + infra_job_id = os.environ["DLWS_JOB_ID"] + elif "DLTS_JOB_ID" in os.environ: + infra_job_id = os.environ["DLTS_JOB_ID"] + else: + infra_job_id = "unknown-job-id" + + summary_writer_dir_name = os.path.join(infra_job_id, "logs") + log_dir = os.path.join(base, summary_writer_dir_name, self.output_path) + os.makedirs(log_dir, exist_ok=True) + self.summary_writer = SummaryWriter(log_dir=log_dir) + return self.summary_writer + + def write_events(self, event_list, flush=True): + if self.enabled and self.summary_writer is not None and dist.get_rank() == 0: + for event in event_list: + self.summary_writer.add_scalar(*event) + if flush: + self.summary_writer.flush() + + def flush(self): + if self.enabled and self.summary_writer is not None and dist.get_rank() == 0: + self.summary_writer.flush() diff --git a/deepspeed/monitor/utils.py b/deepspeed/monitor/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..aa37e8d27cff5a05898697650d65886cdb30a289 --- /dev/null +++ b/deepspeed/monitor/utils.py @@ -0,0 +1,21 @@ +'''Copyright The Microsoft DeepSpeed Team''' + + +def check_tb_availability(): + try: + # torch.utils.tensorboard will fail if `tensorboard` is not available, + # see their docs for more details: https://pytorch.org/docs/1.8.0/tensorboard.html + import tensorboard # noqa: F401 + except ImportError: + print('If you want to use tensorboard logging, please `pip install tensorboard`') + raise + + +def check_wandb_availability(): + try: + import wandb # noqa: F401 + except ImportError: + print( + 'If you want to use wandb logging, please `pip install wandb` and follow the instructions at https://docs.wandb.ai/quickstart' + ) + raise diff --git a/deepspeed/monitor/wandb.py b/deepspeed/monitor/wandb.py new file mode 100644 index 0000000000000000000000000000000000000000..49fc0332070fde62e9125137f9d310ecd51bfe44 --- /dev/null +++ b/deepspeed/monitor/wandb.py @@ -0,0 +1,34 @@ +'''Copyright The Microsoft 
DeepSpeed Team''' + +from .utils import check_wandb_availability +from .monitor import Monitor + +import deepspeed.comm as dist + + +class WandbMonitor(Monitor): + def __init__(self, wandb_config): + super().__init__(wandb_config) + check_wandb_availability() + import wandb + + self.enabled = wandb_config.enabled + self.group = wandb_config.group + self.team = wandb_config.team + self.project = wandb_config.project + + if self.enabled and dist.get_rank() == 0: + wandb.init(project=self.project, group=self.group, entity=self.team) + + def log(self, data, step=None, commit=None, sync=None): + if self.enabled and dist.get_rank() == 0: + import wandb + return wandb.log(data, step=step, commit=commit, sync=sync) + + def write_events(self, event_list): + if self.enabled and dist.get_rank() == 0: + for event in event_list: + label = event[0] + value = event[1] + step = event[2] + self.log({label: value}, step=step) diff --git a/deepspeed/nebula/__init__.py b/deepspeed/nebula/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 --- /dev/null +++ b/deepspeed/nebula/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/nebula/config.py b/deepspeed/nebula/config.py new file mode 100644 index 0000000000000000000000000000000000000000..93b879fd77b78411b554a65d597e6c981781435f --- /dev/null +++ b/deepspeed/nebula/config.py @@ -0,0 +1,54 @@ +'''Copyright The Microsoft DeepSpeed Team''' +""" +Copyright (c) Microsoft Corporation +Licensed under the MIT license. 
+""" + +from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject +from deepspeed.nebula.constants import * + + +class DeepSpeedNebulaConfig(DeepSpeedConfigObject): + def __init__(self, param_dict): + super(DeepSpeedNebulaConfig, self).__init__() + + self.enabled = None + self.persistent_storage_path = None + self.persistent_time_interval = None + self.num_of_version_in_retention = None + self.enable_nebula_load = None + + if NEBULA in param_dict.keys(): + nebula_dict = param_dict[NEBULA] + else: + nebula_dict = {} + + self._initialize(nebula_dict) + + def _initialize(self, nebula_dict): + self.enabled = get_scalar_param(nebula_dict, + NEBULA_ENABLED, + NEBULA_ENABLED_DEFAULT) + + self.load_path = get_scalar_param(nebula_dict, + NEBULA_LOAD_PATH, + NEBULA_LOAD_PATH_DEFAULT) + + self.enable_nebula_load = get_scalar_param(nebula_dict, + NEBULA_ENABLE_NEBULA_LOAD, + NEBULA_ENABLE_NEBULA_LOAD_DEFAULT) + + self.persistent_storage_path = get_scalar_param( + nebula_dict, + NEBULA_PERSISTENT_STORAGE_PATH, + NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT) + + self.persistent_time_interval = get_scalar_param( + nebula_dict, + NEBULA_PERSISTENT_TIME_INTERVAL, + NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT) + + self.num_of_version_in_retention = get_scalar_param( + nebula_dict, + NEBULA_NUM_OF_VERSION_IN_RETENTION, + NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT) diff --git a/deepspeed/nebula/constants.py b/deepspeed/nebula/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..6ad876a8d1a3c996cf8ee5da5230425aed2b09bb --- /dev/null +++ b/deepspeed/nebula/constants.py @@ -0,0 +1,87 @@ +'''Copyright The Microsoft DeepSpeed Team''' +""" +Copyright (c) Microsoft Corporation +Licensed under the MIT license. +""" + +######################################### +# nebula +######################################### +# Nebula. By default, this feature is not enabled. 
+# Users can configure in ds_config.json as below example: +NEBULA_FORMAT = ''' +nebula should be enabled as: +"session_params": { + "nebula": { + "enabled": true, + "persistent_storage_path": "/foo/bar", + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true + } +} +''' + +NEBULA = "nebula" + +NEBULA_ENABLED = "enabled" +NEBULA_ENABLED_DEFAULT = False + +# There is a case where customers want to load the checkpoint saved +# by raw torch. Because nebula cannot load torch checkpoint directly +# as they have different folder structures to bridge the gap for +# loading (the data are totally same in bytes for torch and nebula s +# aving). +# In this case, we must disable nebula load to use raw torch load. +# Customer can just set NEBULA_ENABLE_NEBULA_LOAD to False. Then use +# original way of deepspeed to load, i.e. set the value of "--load". +NEBULA_ENABLE_NEBULA_LOAD = "enable_nebula_load" +NEBULA_ENABLE_NEBULA_LOAD_DEFAULT = True + +# When you want to resume the previous checkpoint saved by nebula, +# you can set NEBULA_LOAD_PATH as the parent folder of checkpoint. +# If NEBULA_LOAD_PATH is None, the NEBULA_PERSISTENT_STORAGE_PATH +# will be the default path to load. +NEBULA_LOAD_PATH = "nebula_load_path" +NEBULA_LOAD_PATH_DEFAULT = None + +# Nebula will save the checkpoint under NEBULA_LOAD_PATH in the +# asynchronous way. +NEBULA_PERSISTENT_STORAGE_PATH = "persistent_storage_path" +NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT = None + +# Time interval to trigger the nebula persistence. +NEBULA_PERSISTENT_TIME_INTERVAL = "persistent_time_interval" +NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT = 100 + +# Checkpoint number which will be kept in memory. Let us say, +# if the value is 2. Then we have checkpoints 1 and 2 are ready +# now. When it comes to checkpoint 3, the 1 will be removed if +# 1 has been persisted to disk.
+NEBULA_NUM_OF_VERSION_IN_RETENTION = "num_of_version_in_retention" +NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2 + +# Nebula envs +NEBULA_EXPORT_ENVS = [ + 'DLTS_JOB_ID', + 'DLTS_NUM_WORKER', + 'NEBULA_PERSISTENT_STORAGE_PATH', + 'NEBULA_PERSISTENT_TIME_INTERVAL', + 'AML_RUN_ID', + 'AZUREML_RUN_TOKEN', + 'AZUREML_WORKSPACE_SCOPE', + 'AZUREML_EXPERIMENT_SCOPE', + 'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT', + 'AZUREML_RUN_ID', + 'NEBULA_MEMORY_BUFFER_SIZE', + 'AZUREML_PARAMETER_ITPJOB_NAME', + 'FC_TASKROLE_NAME', + 'FC_TASK_INDEX', + 'MASTER_HOST', + 'LOCAL_HOST', + 'AZUREML_BLOB_ACCOUNT_NAME', + 'AZUREML_BLOB_ACCOUNT_KEY' +] + +# ITP env files +DLTS_POD_ENV_PATH = '/dlts-runtime/env/pod.env' diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py old mode 100644 new mode 100755 index 698f9599bf9fe46a0f217e941e01038d2f7d6df8..efec4e62c3c973457d5608f4ba87711b53bbca5b --- a/deepspeed/ops/__init__.py +++ b/deepspeed/ops/__init__.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from . import adam from . import adagrad from . 
import lamb diff --git a/deepspeed/ops/adagrad/__init__.py b/deepspeed/ops/adagrad/__init__.py index 3067c5c903895f6773212cbaa2a00be5c7bea33e..a5ab6de0086c34f369651352592b8810435bf253 100644 --- a/deepspeed/ops/adagrad/__init__.py +++ b/deepspeed/ops/adagrad/__init__.py @@ -1 +1,3 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .cpu_adagrad import DeepSpeedCPUAdagrad diff --git a/deepspeed/ops/adagrad/cpu_adagrad.py b/deepspeed/ops/adagrad/cpu_adagrad.py old mode 100644 new mode 100755 index e3f70a61539f173f586e50ade29050e5cb245a5a..07cdaa48c11fff906383241304e0d2152f2e35fd --- a/deepspeed/ops/adagrad/cpu_adagrad.py +++ b/deepspeed/ops/adagrad/cpu_adagrad.py @@ -2,11 +2,8 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' -import math import torch -import time -from pathlib import Path -from ..op_builder import CPUAdagradBuilder +from deepspeed.ops.op_builder import CPUAdagradBuilder from deepspeed.utils.logging import should_log_le diff --git a/deepspeed/ops/adam/__init__.py b/deepspeed/ops/adam/__init__.py old mode 100644 new mode 100755 index 6ab6cbd37f35205f7f9f5fd1be2bbc52ea30febc..111d3175f89e45656d8a543708ff5f5c276a8f97 --- a/deepspeed/ops/adam/__init__.py +++ b/deepspeed/ops/adam/__init__.py @@ -1,2 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .cpu_adam import DeepSpeedCPUAdam from .fused_adam import FusedAdam diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py old mode 100644 new mode 100755 index 1c7dd5d3f61d690ae10e0c3a1987f1d8d0cf13dc..04c5ac34cbe2a1682fa04d7de392a8e6b14b09e7 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -2,12 +2,11 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' -import math import torch -import time -from pathlib import Path -from ..op_builder import CPUAdamBuilder +from cpuinfo import get_cpu_info +from deepspeed.utils import logger from deepspeed.utils.logging import should_log_le +from deepspeed.ops.op_builder import CPUAdamBuilder class 
DeepSpeedCPUAdam(torch.optim.Optimizer): @@ -76,6 +75,20 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): amsgrad=amsgrad) super(DeepSpeedCPUAdam, self).__init__(model_params, default_args) + cpu_info = get_cpu_info() + self.cpu_vendor = cpu_info["vendor_id_raw"].lower( + ) if "vendor_id_raw" in cpu_info else "unknown" + if "amd" in self.cpu_vendor: + for group_id, group in enumerate(self.param_groups): + for param_id, p in enumerate(group['params']): + if p.dtype == torch.half: + logger.warning( + "FP16 params for CPUAdam may not work on AMD CPUs") + break + else: + continue + break + self.opt_id = DeepSpeedCPUAdam.optimizer_id DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1 self.adam_w_mode = adamw_mode diff --git a/deepspeed/ops/adam/fused_adam.py b/deepspeed/ops/adam/fused_adam.py index 80e5bf5657035ede2557409e7aacb94ac9be79cc..169fde67e52e7aece678b34177bd40d296d0fc16 100644 --- a/deepspeed/ops/adam/fused_adam.py +++ b/deepspeed/ops/adam/fused_adam.py @@ -6,11 +6,11 @@ This file is adapted from fused adam in NVIDIA/apex, commit a109f85 ''' import torch -import importlib from .multi_tensor_apply import MultiTensorApply multi_tensor_applier = MultiTensorApply(2048 * 32) -from ..op_builder import FusedAdamBuilder +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import FusedAdamBuilder class FusedAdam(torch.optim.Optimizer): @@ -72,7 +72,7 @@ class FusedAdam(torch.optim.Optimizer): fused_adam_cuda = FusedAdamBuilder().load() # Skip buffer - self._dummy_overflow_buf = torch.cuda.IntTensor([0]) + self._dummy_overflow_buf = get_accelerator().IntTensor([0]) self.multi_tensor_adam = fused_adam_cuda.multi_tensor_adam def zero_grad(self): @@ -109,12 +109,8 @@ class FusedAdam(torch.optim.Optimizer): bias_correction = 1 if group['bias_correction'] else 0 beta1, beta2 = group['betas'] - # assume same step across group now to simplify things - # per parameter step can be easily support by making it tensor, or pass list into 
kernel - if 'step' in group: - group['step'] += 1 - else: - group['step'] = 1 + if 'step' not in group: + group['step'] = 0 # create lists for multi-tensor apply g_16, p_16, m_16, v_16 = [], [], [], [] @@ -131,6 +127,10 @@ class FusedAdam(torch.optim.Optimizer): state = self.state[p] # State initialization if len(state) == 0: + # DeepSpeed ZeRO 3 processes one subgroup at a time, so we need to keep tracking step count for each tensor separately. + # While this is not an issue for ZeRO 1 & 2, since they apply a single optimization step to the whole param group at the same time. + # In order to keep backward compatibility for the existing checkpoints, we use group['step'] to initialize state['step'] if it exists. + state['step'] = group.get('step', 0) # Exponential moving average of gradient values state['exp_avg'] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values @@ -150,6 +150,7 @@ class FusedAdam(torch.optim.Optimizer): raise RuntimeError('FusedAdam only support fp16 and fp32.') if (len(g_16) > 0): + state['step'] += 1 multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_16, @@ -160,11 +161,12 @@ class FusedAdam(torch.optim.Optimizer): beta1, beta2, group['eps'], - group['step'], + state['step'], self.adam_w_mode, bias_correction, group['weight_decay']) if (len(g_32) > 0): + state['step'] += 1 multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, @@ -175,7 +177,7 @@ class FusedAdam(torch.optim.Optimizer): beta1, beta2, group['eps'], - group['step'], + state['step'], self.adam_w_mode, bias_correction, group['weight_decay']) diff --git a/deepspeed/ops/adam/multi_tensor_apply.py b/deepspeed/ops/adam/multi_tensor_apply.py index 173288ee76ab37ef243f680dfad4b3ff5faf2458..e837309be629d37b0155306d8e1328cbbf8ada5e 100644 --- a/deepspeed/ops/adam/multi_tensor_apply.py +++ b/deepspeed/ops/adam/multi_tensor_apply.py @@ -4,7 +4,6 @@ Copyright 2020 The Microsoft DeepSpeed Team Copyright NVIDIA/apex This 
file is adapted from NVIDIA/apex, commit a109f85 ''' -import torch class MultiTensorApply(object): diff --git a/deepspeed/ops/aio/__init__.py b/deepspeed/ops/aio/__init__.py old mode 100644 new mode 100755 diff --git a/deepspeed/ops/csrc b/deepspeed/ops/csrc new file mode 120000 index 0000000000000000000000000000000000000000..338652302505df522420eff27c95309b6ee102fd --- /dev/null +++ b/deepspeed/ops/csrc @@ -0,0 +1 @@ +../../csrc \ No newline at end of file diff --git a/deepspeed/ops/csrc/adagrad/cpu_adagrad.cpp b/deepspeed/ops/csrc/adagrad/cpu_adagrad.cpp deleted file mode 100644 index 4f2a9b69ef966599d1bd6664f79e312c9240671b..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/adagrad/cpu_adagrad.cpp +++ /dev/null @@ -1,227 +0,0 @@ -#include "cpu_adagrad.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" -#include "custom_cuda_layers.h" - -static std::unordered_map> s_optimizers; - -// C++ interface - -void Adagrad_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); -#endif - if (_param_size > rounded_size) { - float step_size = -1 * _alpha; - __half* grads_cast_h; - __half* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast<__half*>(grads); - params_cast_h = reinterpret_cast<__half*>(_params); - } - for (size_t t = rounded_size; t < _param_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > _param_size) copy_size = _param_size - t; - size_t offset = copy_size + t; - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#pragma omp parallel for - for (size_t k = t; k < offset; k++) { - float grad = half_precision ? 
(float)grads_cast_h[k] : grads[k]; - float param = half_precision ? (float)params_cast_h[k] : _params[k]; - float momentum = grads[k]; - float variance = _exp_avg_sq[k]; - if (_weight_decay > 0) { grad = param * _weight_decay + grad; } - - variance += grad * grad; - - grad = sqrt(variance); - grad += _eps; - grad = momentum / grad; - param = grad * step_size + param; - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; - - if (half_precision) - params_cast_h[k] = (__half)param; - else - _params[k] = param; - // STORE UPDATE TERM TO GRAD'S MEMORY - grads[k] = grad * step_size; - _exp_avg_sq[k] = variance; - } - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - _buf_index = !_buf_index; - } - } - } -} - -void Adagrad_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); -#endif - if (_param_size > rounded_size) - Step_1((_params + rounded_size), - (grads + rounded_size), - (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); -} - -int create_adagrad_optimizer(int optimizer_id, - float alpha = 1e-2, - float eps = 1e-8, - float weight_decay = 0, - bool should_log = false) -{ - auto opt = std::make_shared(alpha, eps, weight_decay); - - s_optimizers[optimizer_id] = opt; - - if (should_log) { - std::string avx_type = ""; -#if defined(__AVX512__) - avx_type = "AVX512"; -#else -#if defined(__AVX256__) - avx_type = "AVX2"; -#else - avx_type = "scalar"; -#endif -#endif - - printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n", - optimizer_id, - avx_type.c_str()); - printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay); - } - - return 0; -} - -void Adagrad_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); -#endif - if (_param_size > rounded_size) - Step_4((_params + rounded_size), - (grads + rounded_size), - (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); -} - -int ds_adagrad_step(int optimizer_id, - size_t step, - float lr, - float epsilon, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg_sq) -{ - auto params_c = params.contiguous(); - auto grads_c = grads.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step); - opt->update_state(lr, epsilon, weight_decay); - opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0)); - - opt->SynchronizeStreams(); - return 0; -} - -int ds_adagrad_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float epsilon, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg_sq, - torch::Tensor& gpu_params) -{ - auto params_c = params.contiguous(); - auto gpu_params_c = gpu_params.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step); - opt->update_state(lr, epsilon, weight_decay); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_sq_ptr, - params_c.size(0), - gpu_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); - return 0; -} - -int destroy_adagrad_optimizer(int optimizer_id) -{ - s_optimizers.erase(optimizer_id); - - return 0; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad 
update (C++)"); - m.def("adagrad_update_copy", - &ds_adagrad_step_plus_copy, - "DeepSpeed CPU Adagrad update and param copy (C++)"); - m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)"); - m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)"); -} diff --git a/deepspeed/ops/csrc/adagrad/cpu_adagrad_hip.cpp b/deepspeed/ops/csrc/adagrad/cpu_adagrad_hip.cpp deleted file mode 100644 index 6bbe9a9ee564c9e8f081c083202326ad279eddd1..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/adagrad/cpu_adagrad_hip.cpp +++ /dev/null @@ -1,228 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "cpu_adagrad_hip.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "rocblas.h" -#include "hip/hip_runtime.h" -#include "hiprand/hiprand.h" -#include "custom_hip_layers.h" - -static std::unordered_map> s_optimizers; - -// C++ interface - -void Adagrad_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); -#endif - if (_param_size > rounded_size) { - float step_size = -1 * _alpha; - __half* grads_cast_h; - __half* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast<__half*>(grads); - params_cast_h = reinterpret_cast<__half*>(_params); - } - for (size_t t = rounded_size; t < _param_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > _param_size) copy_size = _param_size - t; - size_t offset = copy_size + t; - if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); } -#pragma omp parallel for - for (size_t k = t; k < offset; k++) { - float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; - float param = half_precision ? 
(float)params_cast_h[k] : _params[k]; - float momentum = grads[k]; - float variance = _exp_avg_sq[k]; - if (_weight_decay > 0) { grad = param * _weight_decay + grad; } - - variance += grad * grad; - - grad = sqrt(variance); - grad += _eps; - grad = momentum / grad; - param = grad * step_size + param; - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; - - if (half_precision) - params_cast_h[k] = (__half)param; - else - _params[k] = param; - // STORE UPDATE TERM TO GRAD'S MEMORY - grads[k] = grad * step_size; - _exp_avg_sq[k] = variance; - } - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - _buf_index = !_buf_index; - } - } - } -} - -void Adagrad_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); -#endif - if (_param_size > rounded_size) - Step_1((_params + rounded_size), - (grads + rounded_size), - (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); -} - -int create_adagrad_optimizer(int optimizer_id, - float alpha = 1e-2, - float eps = 1e-8, - float weight_decay = 0, - bool should_log = false) -{ - auto opt = std::make_shared(alpha, eps, weight_decay); - - s_optimizers[optimizer_id] = opt; - - if (should_log) { - std::string avx_type = ""; -#if defined(__AVX512__) - avx_type = "AVX512"; -#else -#if defined(__AVX256__) - avx_type = "AVX2"; -#else - avx_type = "scalar"; -#endif -#endif - - printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n", - optimizer_id, - avx_type.c_str()); - printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay); - } - - return 0; -} - -void Adagrad_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); -#endif - if (_param_size > rounded_size) - Step_4((_params + rounded_size), - (grads + rounded_size), - (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); -} - -int ds_adagrad_step(int optimizer_id, - size_t step, - float lr, - float epsilon, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg_sq) -{ - auto params_c = params.contiguous(); - auto grads_c = grads.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step); - opt->update_state(lr, epsilon, weight_decay); - opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0)); - - opt->SynchronizeStreams(); - return 0; -} - -int ds_adagrad_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float epsilon, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg_sq, - torch::Tensor& gpu_params) -{ - auto params_c = params.contiguous(); - auto gpu_params_c = gpu_params.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step); - opt->update_state(lr, epsilon, weight_decay); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_sq_ptr, - params_c.size(0), - gpu_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); - return 0; -} - -int destroy_adagrad_optimizer(int optimizer_id) -{ - s_optimizers.erase(optimizer_id); - - return 0; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad 
update (C++)"); - m.def("adagrad_update_copy", - &ds_adagrad_step_plus_copy, - "DeepSpeed CPU Adagrad update and param copy (C++)"); - m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)"); - m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)"); -} diff --git a/deepspeed/ops/csrc/adam/cpu_adam.cpp b/deepspeed/ops/csrc/adam/cpu_adam.cpp deleted file mode 100644 index 727eec8182c12cdcc0dcb0df53a38918bdeae4b8..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/adam/cpu_adam.cpp +++ /dev/null @@ -1,292 +0,0 @@ -#include "cpu_adam.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" -#include "custom_cuda_layers.h" - -static std::unordered_map> s_optimizers; - -// C++ interface - -void Adam_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); -#endif - if (_param_size > rounded_size) { - float betta1_minus1 = 1 - _betta1; - float betta2_minus1 = 1 - _betta2; - - float step_size = -1 * _alpha / _bias_correction1; - float w_decay = -1 * _alpha * _weight_decay; - __half* grads_cast_h; - __half* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast<__half*>(grads); - params_cast_h = reinterpret_cast<__half*>(_params); - } - - for (size_t t = rounded_size; t < _param_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > _param_size) copy_size = _param_size - t; - size_t offset = copy_size + t; - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } - -#pragma omp parallel for - for (size_t k = t; k < offset; k++) { - float grad = half_precision ? 
(float)grads_cast_h[k] : grads[k]; - float param = half_precision ? (float)params_cast_h[k] : _params[k]; - float momentum = _exp_avg[k]; - float variance = _exp_avg_sq[k]; - if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; } - momentum = momentum * _betta1; - momentum = grad * betta1_minus1 + momentum; - - variance = variance * _betta2; - grad = grad * grad; - variance = grad * betta2_minus1 + variance; - - grad = sqrt(variance); - grad = grad * _bias_correction2 + _eps; - grad = momentum / grad; - if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; } - param = grad * step_size + param; - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; - - if (half_precision) - params_cast_h[k] = (__half)param; - else - _params[k] = param; - _exp_avg[k] = momentum; - _exp_avg_sq[k] = variance; - } - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - - _buf_index = !_buf_index; - } - } - } -} - -void Adam_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); -#endif - if (_param_size > rounded_size) - Step_1((_params + rounded_size), - (grads + rounded_size), - (_exp_avg + rounded_size), - (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); -} - -int create_adam_optimizer(int optimizer_id, - float alpha = 1e-3, - float betta1 = 0.9, - float betta2 = 0.999, - float eps = 1e-8, - float weight_decay = 0, - bool adamw_mode = true, - bool should_log = false) -{ - auto opt = - std::make_shared(alpha, betta1, betta2, eps, weight_decay, adamw_mode); - - s_optimizers[optimizer_id] = opt; - - if (should_log) { - std::string avx_type = ""; -#if defined(__AVX512__) - avx_type = "AVX512"; -#else -#if defined(__AVX256__) - avx_type = "AVX2"; -#else - avx_type = "scalar"; -#endif -#endif - - printf("Adam Optimizer #%d is created with %s arithmetic capability.\n", - optimizer_id, - avx_type.c_str()); - printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n", - alpha, - betta1, - betta2, - weight_decay, - (int)adamw_mode); - } - - return 0; -} - -void Adam_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); -#endif - if (_param_size > rounded_size) - Step_4((_params + rounded_size), - (grads + rounded_size), - (_exp_avg + rounded_size), - (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); -} - -int ds_adam_step(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float epsilon, - float weight_decay, - bool bias_correction, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& exp_avg_sq) -{ - auto params_c = params.contiguous(); - auto grads_c = grads.contiguous(); - auto exp_avg_c = exp_avg.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - - // assert(params.options().dtype() == grads.options().dtype()); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step, beta1, beta2); - opt->update_state(lr, epsilon, weight_decay, bias_correction); - - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - exp_avg_sq_ptr, - params_c.size(0), - nullptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); - return 0; -} - -int ds_adam_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float epsilon, - float weight_decay, - bool bias_correction, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& exp_avg_sq, - torch::Tensor& gpu_params) -{ - auto params_c = params.contiguous(); - auto gpu_params_c = gpu_params.contiguous(); - auto exp_avg_c = exp_avg.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - 
std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step, beta1, beta2); - opt->update_state(lr, epsilon, weight_decay, bias_correction); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - exp_avg_sq_ptr, - params_c.size(0), - gpu_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); - return 0; -} - -int destroy_adam_optimizer(int optimizer_id) -{ - s_optimizers.erase(optimizer_id); - - return 0; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); - m.def("adam_update_copy", - &ds_adam_step_plus_copy, - "DeepSpeed CPU Adam update and param copy (C++)"); - m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); - m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)"); -} diff --git a/deepspeed/ops/csrc/adam/cpu_adam_hip.cpp b/deepspeed/ops/csrc/adam/cpu_adam_hip.cpp deleted file mode 100644 index 67163979fe3311b85e6b3be3d587bdc1c498485f..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/adam/cpu_adam_hip.cpp +++ /dev/null @@ -1,293 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "cpu_adam_hip.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "rocblas.h" -#include "hip/hip_runtime.h" -#include "hiprand/hiprand.h" -#include "custom_hip_layers.h" - -static std::unordered_map> s_optimizers; - -// C++ interface - -void Adam_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); -#endif - if (_param_size > rounded_size) { - float betta1_minus1 = 1 - _betta1; - float betta2_minus1 = 1 - _betta2; - - float step_size = -1 * _alpha / _bias_correction1; - float w_decay = -1 * _alpha * _weight_decay; - __half* grads_cast_h; - __half* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast<__half*>(grads); - params_cast_h = reinterpret_cast<__half*>(_params); - } - - for (size_t t = rounded_size; t < _param_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > _param_size) copy_size = _param_size - t; - size_t offset = copy_size + t; - if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); } - -#pragma omp parallel for - for (size_t k = t; k < offset; k++) { - float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; - float param = half_precision ? 
(float)params_cast_h[k] : _params[k]; - float momentum = _exp_avg[k]; - float variance = _exp_avg_sq[k]; - if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; } - momentum = momentum * _betta1; - momentum = grad * betta1_minus1 + momentum; - - variance = variance * _betta2; - grad = grad * grad; - variance = grad * betta2_minus1 + variance; - - grad = sqrt(variance); - grad = grad * _bias_correction2 + _eps; - grad = momentum / grad; - if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; } - param = grad * step_size + param; - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; - - if (half_precision) - params_cast_h[k] = (__half)param; - else - _params[k] = param; - _exp_avg[k] = momentum; - _exp_avg_sq[k] = variance; - } - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - - _buf_index = !_buf_index; - } - } - } -} - -void Adam_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); -#endif - if (_param_size > rounded_size) - Step_1((_params + rounded_size), - (grads + rounded_size), - (_exp_avg + rounded_size), - (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); -} - -int create_adam_optimizer(int optimizer_id, - float alpha = 1e-3, - float betta1 = 0.9, - float betta2 = 0.999, - float eps = 1e-8, - float weight_decay = 0, - bool adamw_mode = true, - bool should_log = false) -{ - auto opt = - std::make_shared(alpha, betta1, betta2, eps, weight_decay, adamw_mode); - - s_optimizers[optimizer_id] = opt; - - if (should_log) { - std::string avx_type = ""; -#if defined(__AVX512__) - avx_type = "AVX512"; -#else -#if defined(__AVX256__) - avx_type = "AVX2"; -#else - avx_type = "scalar"; -#endif -#endif - - printf("Adam Optimizer #%d is created with %s arithmetic capability.\n", - optimizer_id, - avx_type.c_str()); - printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n", - alpha, - betta1, - betta2, - weight_decay, - (int)adamw_mode); - } - - return 0; -} - -void Adam_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t rounded_size = 0; -#if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); -#endif - if (_param_size > rounded_size) - Step_4((_params + rounded_size), - (grads + rounded_size), - (_exp_avg + rounded_size), - (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); -} - -int ds_adam_step(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float epsilon, - float weight_decay, - bool bias_correction, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& exp_avg_sq) -{ - auto params_c = params.contiguous(); - auto grads_c = grads.contiguous(); - auto exp_avg_c = exp_avg.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - - // assert(params.options().dtype() == grads.options().dtype()); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step, beta1, beta2); - opt->update_state(lr, epsilon, weight_decay, bias_correction); - - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - exp_avg_sq_ptr, - params_c.size(0), - nullptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); - return 0; -} - -int ds_adam_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float epsilon, - float weight_decay, - bool bias_correction, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& exp_avg_sq, - torch::Tensor& gpu_params) -{ - auto params_c = params.contiguous(); - auto gpu_params_c = gpu_params.contiguous(); - auto exp_avg_c = exp_avg.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - 
std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step, beta1, beta2); - opt->update_state(lr, epsilon, weight_decay, bias_correction); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - exp_avg_sq_ptr, - params_c.size(0), - gpu_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); - return 0; -} - -int destroy_adam_optimizer(int optimizer_id) -{ - s_optimizers.erase(optimizer_id); - - return 0; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); - m.def("adam_update_copy", - &ds_adam_step_plus_copy, - "DeepSpeed CPU Adam update and param copy (C++)"); - m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); - m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)"); -} diff --git a/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp b/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp deleted file mode 100644 index b06531c53002c1186dac8c7e1a168bfa72e31fef..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include - -void multi_tensor_adam_cuda(int chunk_size, - at::Tensor noop_flag, - std::vector> tensor_lists, - const float lr, - const float beta1, - const float beta2, - const float epsilon, - const int step, - const int mode, - const int bias_correction, - const float weight_decay); - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("multi_tensor_adam", - &multi_tensor_adam_cuda, - "Compute and apply gradient update to parameters for Adam optimizer"); -} diff --git a/deepspeed/ops/csrc/adam/multi_tensor_adam.cu b/deepspeed/ops/csrc/adam/multi_tensor_adam.cu deleted file mode 100644 index 3cb9763befcea663966347f23a2d9b925a7db8bd..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/adam/multi_tensor_adam.cu +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright 2020 The Microsoft DeepSpeed Team - Copyright 
NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 -*/ - -#include -#include -#include -#include -// Another possibility: -// #include - -#include - -#include "multi_tensor_apply.cuh" -#include "type_shim.h" - -#define BLOCK_SIZE 512 -#define ILP 4 - -typedef enum { - ADAM_MODE_0 = 0, // L2 regularization mode - ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW) -} adamMode_t; - -using MATH_T = float; - -template -struct AdamFunctor { - __device__ __forceinline__ void operator()(int chunk_size, - volatile int* noop_gmem, - TensorListMetadata<4>& tl, - const float beta1, - const float beta2, - const float beta1_correction, - const float beta2_correction, - const float epsilon, - const float lr, - adamMode_t mode, - const float decay) - { - // I'd like this kernel to propagate infs/nans. - // if(*noop_gmem == 1) - // return; - - int tensor_loc = tl.block_to_tensor[blockIdx.x]; - - // potentially use to pass in list of scalar - // int tensor_num = tl.start_tensor_this_launch + tensor_loc; - - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; - - T* g = (T*)tl.addresses[0][tensor_loc]; - g += chunk_idx * chunk_size; - - T* p = (T*)tl.addresses[1][tensor_loc]; - p += chunk_idx * chunk_size; - - T* m = (T*)tl.addresses[2][tensor_loc]; - m += chunk_idx * chunk_size; - - T* v = (T*)tl.addresses[3][tensor_loc]; - v += chunk_idx * chunk_size; - - n -= chunk_idx * chunk_size; - - // see note in multi_tensor_scale_kernel.cu - for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) { - MATH_T r_g[ILP]; - MATH_T r_p[ILP]; - MATH_T r_m[ILP]; - MATH_T r_v[ILP]; -#pragma unroll - for (int ii = 0; ii < ILP; ii++) { - int i = i_start + threadIdx.x + ii * blockDim.x; - if (i < n && i < chunk_size) { - r_g[ii] = g[i]; - r_p[ii] = p[i]; - r_m[ii] = m[i]; - r_v[ii] = v[i]; - } else { - r_g[ii] = MATH_T(0); - r_p[ii] = MATH_T(0); - r_m[ii] = MATH_T(0); - r_v[ii] = MATH_T(0); - } - } -#pragma 
unroll - for (int ii = 0; ii < ILP; ii++) { - if (mode == ADAM_MODE_0) { // L2 - r_g[ii] = r_g[ii] + (decay * r_p[ii]); - r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; - r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; - MATH_T next_m_unbiased = r_m[ii] / beta1_correction; - MATH_T next_v_unbiased = r_v[ii] / beta2_correction; - MATH_T denom = sqrtf(next_v_unbiased) + epsilon; - MATH_T update = next_m_unbiased / denom; - r_p[ii] = r_p[ii] - (lr * update); - } else { // weight decay - r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; - r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; - MATH_T next_m_unbiased = r_m[ii] / beta1_correction; - MATH_T next_v_unbiased = r_v[ii] / beta2_correction; - MATH_T denom = sqrtf(next_v_unbiased) + epsilon; - MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]); - r_p[ii] = r_p[ii] - (lr * update); - } - } -#pragma unroll - for (int ii = 0; ii < ILP; ii++) { - int i = i_start + threadIdx.x + ii * blockDim.x; - if (i < n && i < chunk_size) { - p[i] = r_p[ii]; - m[i] = r_m[ii]; - v[i] = r_v[ii]; - } - } - } - } -}; - -void multi_tensor_adam_cuda(int chunk_size, - at::Tensor noop_flag, - std::vector> tensor_lists, - const float lr, - const float beta1, - const float beta2, - const float epsilon, - const int step, - const int mode, - const int bias_correction, - const float weight_decay) -{ - using namespace at; - - // Handle bias correction mode - float bias_correction1 = 1.0f, bias_correction2 = 1.0f; - if (bias_correction == 1) { - bias_correction1 = 1 - std::pow(beta1, step); - bias_correction2 = 1 - std::pow(beta2, step); - } - - // Assume single type across p,g,m1,m2 now - DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), - 0, - "adam", - multi_tensor_apply<4>(BLOCK_SIZE, - chunk_size, - noop_flag, - tensor_lists, - AdamFunctor(), - beta1, - beta2, - bias_correction1, - bias_correction2, - epsilon, - lr, - (adamMode_t)mode, - weight_decay);) - - 
AT_CUDA_CHECK(cudaGetLastError()); -} diff --git a/deepspeed/ops/csrc/adam/multi_tensor_adam.hip b/deepspeed/ops/csrc/adam/multi_tensor_adam.hip deleted file mode 100644 index f0b7ced5c29646b793f8fa904768c091fd9d749e..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/adam/multi_tensor_adam.hip +++ /dev/null @@ -1,164 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -/* Copyright 2020 The Microsoft DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 -*/ - -#include -#include -#include -#include -// Another possibility: -// #include - -#include - -#include "multi_tensor_apply_hip.cuh" -#include "type_shim_hip.h" - -#define BLOCK_SIZE 512 -#define ILP 4 - -typedef enum { - ADAM_MODE_0 = 0, // L2 regularization mode - ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW) -} adamMode_t; - -using MATH_T = float; - -template -struct AdamFunctor { - __device__ __forceinline__ void operator()(int chunk_size, - volatile int* noop_gmem, - TensorListMetadata<4>& tl, - const float beta1, - const float beta2, - const float beta1_correction, - const float beta2_correction, - const float epsilon, - const float lr, - adamMode_t mode, - const float decay) - { - // I'd like this kernel to propagate infs/nans. 
- // if(*noop_gmem == 1) - // return; - - int tensor_loc = tl.block_to_tensor[blockIdx.x]; - - // potentially use to pass in list of scalar - // int tensor_num = tl.start_tensor_this_launch + tensor_loc; - - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; - - T* g = (T*)tl.addresses[0][tensor_loc]; - g += chunk_idx * chunk_size; - - T* p = (T*)tl.addresses[1][tensor_loc]; - p += chunk_idx * chunk_size; - - T* m = (T*)tl.addresses[2][tensor_loc]; - m += chunk_idx * chunk_size; - - T* v = (T*)tl.addresses[3][tensor_loc]; - v += chunk_idx * chunk_size; - - n -= chunk_idx * chunk_size; - - // see note in multi_tensor_scale_kernel.cu - for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) { - MATH_T r_g[ILP]; - MATH_T r_p[ILP]; - MATH_T r_m[ILP]; - MATH_T r_v[ILP]; -#pragma unroll - for (int ii = 0; ii < ILP; ii++) { - int i = i_start + threadIdx.x + ii * blockDim.x; - if (i < n && i < chunk_size) { - r_g[ii] = g[i]; - r_p[ii] = p[i]; - r_m[ii] = m[i]; - r_v[ii] = v[i]; - } else { - r_g[ii] = MATH_T(0); - r_p[ii] = MATH_T(0); - r_m[ii] = MATH_T(0); - r_v[ii] = MATH_T(0); - } - } -#pragma unroll - for (int ii = 0; ii < ILP; ii++) { - if (mode == ADAM_MODE_0) { // L2 - r_g[ii] = r_g[ii] + (decay * r_p[ii]); - r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; - r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; - MATH_T next_m_unbiased = r_m[ii] / beta1_correction; - MATH_T next_v_unbiased = r_v[ii] / beta2_correction; - MATH_T denom = sqrtf(next_v_unbiased) + epsilon; - MATH_T update = next_m_unbiased / denom; - r_p[ii] = r_p[ii] - (lr * update); - } else { // weight decay - r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; - r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; - MATH_T next_m_unbiased = r_m[ii] / beta1_correction; - MATH_T next_v_unbiased = r_v[ii] / beta2_correction; - MATH_T denom = sqrtf(next_v_unbiased) + epsilon; - MATH_T update = (next_m_unbiased / denom) + (decay 
* r_p[ii]); - r_p[ii] = r_p[ii] - (lr * update); - } - } -#pragma unroll - for (int ii = 0; ii < ILP; ii++) { - int i = i_start + threadIdx.x + ii * blockDim.x; - if (i < n && i < chunk_size) { - p[i] = r_p[ii]; - m[i] = r_m[ii]; - v[i] = r_v[ii]; - } - } - } - } -}; - -void multi_tensor_adam_cuda(int chunk_size, - at::Tensor noop_flag, - std::vector> tensor_lists, - const float lr, - const float beta1, - const float beta2, - const float epsilon, - const int step, - const int mode, - const int bias_correction, - const float weight_decay) -{ - using namespace at; - - // Handle bias correction mode - float bias_correction1 = 1.0f, bias_correction2 = 1.0f; - if (bias_correction == 1) { - bias_correction1 = 1 - ::pow(beta1, step); - bias_correction2 = 1 - ::pow(beta2, step); - } - - // Assume single type across p,g,m1,m2 now - DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), - 0, - "adam", - multi_tensor_apply<4>(BLOCK_SIZE, - chunk_size, - noop_flag, - tensor_lists, - AdamFunctor(), - beta1, - beta2, - bias_correction1, - bias_correction2, - epsilon, - lr, - (adamMode_t)mode, - weight_decay);) - - AT_CUDA_CHECK(hipGetLastError()); -} diff --git a/deepspeed/ops/csrc/adam/multi_tensor_apply.cuh b/deepspeed/ops/csrc/adam/multi_tensor_apply.cuh deleted file mode 100644 index 13af4b7578f6db7db066b7ff7f17edd86e8fd6d3..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/adam/multi_tensor_apply.cuh +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright 2020 The Microsoft DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 -*/ - -#include -#include -#include -#include -#include -#include "compat.h" - -#include - -// #include - -// This header is the one-stop shop for all your multi-tensor apply needs. 
- -// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson) -constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; -constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; - -template -struct TensorListMetadata { - void* addresses[n][depth_to_max_tensors[n - 1]]; - int sizes[depth_to_max_tensors[n - 1]]; - unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; - int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int. - int start_tensor_this_launch; -}; - -template -__global__ void multi_tensor_apply_kernel(int chunk_size, - volatile int* noop_flag, - T tl, - U callable, - ArgTypes... args) -{ - // Hand the chunk information to the user-supplied functor to process however it likes. - callable(chunk_size, noop_flag, tl, args...); -} - -template -void multi_tensor_apply(int block_size, - int chunk_size, - const at::Tensor& noop_flag, - const std::vector>& tensor_lists, - T callable, - ArgTypes... args) -{ - TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth"); - int len0 = tensor_lists[0].size(); - TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0"); - auto ref_device = tensor_lists[0][0].device(); - TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda"); - for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices - { - TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists"); - for (int t = 0; t < tensor_lists[l].size(); t++) { - // TODO: Print which tensor fails. 
- bool contiguous_memory = tensor_lists[l][t].is_contiguous(); -#ifdef VERSION_GE_1_5 - contiguous_memory = (contiguous_memory || - tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast)); -#endif - TORCH_CHECK(contiguous_memory, "A tensor was not contiguous."); - TORCH_CHECK(tensor_lists[l][t].device() == ref_device, - "A tensor was not on the same device as the first tensor"); - TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch"); - } - } - - int ntensors = tensor_lists[0].size(); - - TensorListMetadata tl; - - const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); - auto stream = at::cuda::getCurrentCUDAStream(); - - tl.start_tensor_this_launch = 0; - int loc_block_info = 0; - int loc_tensor_info = 0; - for (int t = 0; t < ntensors; t++) { - tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); - for (int d = 0; d < depth; d++) - tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); - loc_tensor_info++; - - int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; - - for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { - // std::cout << chunks_this_tensor << std::endl; - tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; - tl.block_to_chunk[loc_block_info] = chunk; - loc_block_info++; - - bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] && - chunk == chunks_this_tensor - 1); - bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]); - bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1); - if (tensors_full || blocks_full || last_chunk) { - // using accscalar_t = acc_type; - multi_tensor_apply_kernel<<>>( - chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); - - AT_CUDA_CHECK(cudaGetLastError()); - - // Reset. The control flow possibilities here make my brain hurt. 
- loc_block_info = 0; - if (chunk == chunks_this_tensor - 1) { - // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << - // std::endl; - loc_tensor_info = 0; - tl.start_tensor_this_launch = t + 1; - } else { - // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << - // std::endl; - tl.sizes[0] = tl.sizes[loc_tensor_info - 1]; - for (int d = 0; d < depth; d++) - tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1]; - loc_tensor_info = 1; - tl.start_tensor_this_launch = t; - } - } - } - } -} diff --git a/deepspeed/ops/csrc/adam/multi_tensor_apply_hip.cuh b/deepspeed/ops/csrc/adam/multi_tensor_apply_hip.cuh deleted file mode 100644 index 09bc9971f216f73d7e33a1b75c52d2e975115743..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/adam/multi_tensor_apply_hip.cuh +++ /dev/null @@ -1,129 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -/* Copyright 2020 The Microsoft DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 -*/ - -#include -#include -#include -#include -#include -#include "compat.h" - -#include - -// #include - -// This header is the one-stop shop for all your multi-tensor apply needs. - -// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson) -constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; -constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; - -template -struct TensorListMetadata { - void* addresses[n][depth_to_max_tensors[n - 1]]; - int sizes[depth_to_max_tensors[n - 1]]; - unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; - int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int. - int start_tensor_this_launch; -}; - -template -__global__ void multi_tensor_apply_kernel(int chunk_size, - volatile int* noop_flag, - T tl, - U callable, - ArgTypes... 
args) -{ - // Hand the chunk information to the user-supplied functor to process however it likes. - callable(chunk_size, noop_flag, tl, args...); -} - -template -void multi_tensor_apply(int block_size, - int chunk_size, - const at::Tensor& noop_flag, - const std::vector>& tensor_lists, - T callable, - ArgTypes... args) -{ - TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth"); - int len0 = tensor_lists[0].size(); - TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0"); - auto ref_device = tensor_lists[0][0].device(); - TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda"); - for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices - { - TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists"); - for (int t = 0; t < tensor_lists[l].size(); t++) { - // TODO: Print which tensor fails. - bool contiguous_memory = tensor_lists[l][t].is_contiguous(); -#ifdef VERSION_GE_1_5 - contiguous_memory = (contiguous_memory || - tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast)); -#endif - TORCH_CHECK(contiguous_memory, "A tensor was not contiguous."); - TORCH_CHECK(tensor_lists[l][t].device() == ref_device, - "A tensor was not on the same device as the first tensor"); - TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch"); - } - } - - int ntensors = tensor_lists[0].size(); - - TensorListMetadata tl; - - const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0])); - auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA(); - - tl.start_tensor_this_launch = 0; - int loc_block_info = 0; - int loc_tensor_info = 0; - for (int t = 0; t < ntensors; t++) { - tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); - for (int d = 0; d < depth; d++) - tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); - loc_tensor_info++; - - int chunks_this_tensor = (tensor_lists[0][t].numel() + 
chunk_size - 1) / chunk_size; - - for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { - // std::cout << chunks_this_tensor << std::endl; - tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; - tl.block_to_chunk[loc_block_info] = chunk; - loc_block_info++; - - bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] && - chunk == chunks_this_tensor - 1); - bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]); - bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1); - if (tensors_full || blocks_full || last_chunk) { - // using accscalar_t = acc_type; - hipLaunchKernelGGL(( multi_tensor_apply_kernel), dim3(loc_block_info), dim3(block_size), 0, stream, - chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); - - AT_CUDA_CHECK(hipGetLastError()); - - // Reset. The control flow possibilities here make my brain hurt. - loc_block_info = 0; - if (chunk == chunks_this_tensor - 1) { - // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << - // std::endl; - loc_tensor_info = 0; - tl.start_tensor_this_launch = t + 1; - } else { - // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << - // std::endl; - tl.sizes[0] = tl.sizes[loc_tensor_info - 1]; - for (int d = 0; d < depth; d++) - tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1]; - loc_tensor_info = 1; - tl.start_tensor_this_launch = t; - } - } - } - } -} diff --git a/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.cpp b/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.cpp deleted file mode 100644 index 9e405d8e704c3c260bd8276f648b770471bcd4b7..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.cpp +++ /dev/null @@ -1,333 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "deepspeed_aio_common.h" - -using namespace std; -using namespace std::chrono; - -#define DEBUG_DS_AIO_PERF 0 -#define DEBUG_DS_AIO_SUBMIT_PERF 0 - -static const std::string c_library_name = "deepspeed_aio"; - -static void _report_aio_statistics(const char* tag, - const std::vector>& latencies) - __attribute__((unused)); - -static void _report_aio_statistics(const char* tag, - const std::vector>& latencies) -{ - std::vector lat_usec; - for (auto& lat : latencies) { lat_usec.push_back(lat.count() * 1e6); } - const auto min_lat = *(std::min_element(lat_usec.begin(), lat_usec.end())); - const auto max_lat = *(std::max_element(lat_usec.begin(), lat_usec.end())); - const auto avg_lat = std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); - - std::cout << c_library_name << ": latency statistics(usec) " << tag - << " min/max/avg = " << min_lat << " " << max_lat << " " << avg_lat << std::endl; -} - -static void _get_aio_latencies(std::vector>& raw_latencies, - struct deepspeed_aio_latency_t& summary_latencies) -{ - std::vector lat_usec; - for (auto& lat : raw_latencies) { lat_usec.push_back(lat.count() * 1e6); } - summary_latencies._min_usec = *(std::min_element(lat_usec.begin(), lat_usec.end())); - summary_latencies._max_usec = *(std::max_element(lat_usec.begin(), lat_usec.end())); - summary_latencies._avg_usec = - std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); -} - -static void _do_io_submit_singles(const long long int n_iocbs, - const long long int iocb_index, - std::unique_ptr& aio_ctxt, - std::vector>& submit_times) -{ - for (auto i = 0; i < n_iocbs; ++i) { - const auto st = std::chrono::high_resolution_clock::now(); - const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, 1, aio_ctxt->_iocbs.data() + i); - 
submit_times.push_back(std::chrono::high_resolution_clock::now() - st); -#if DEBUG_DS_AIO_SUBMIT_PERF - printf("submit(usec) %f io_index=%lld buf=%p len=%lu off=%llu \n", - submit_times.back().count() * 1e6, - iocb_index, - aio_ctxt->_iocbs[i]->u.c.buf, - aio_ctxt->_iocbs[i]->u.c.nbytes, - aio_ctxt->_iocbs[i]->u.c.offset); -#endif - assert(submit_ret > 0); - } -} - -static void _do_io_submit_block(const long long int n_iocbs, - const long long int iocb_index, - std::unique_ptr& aio_ctxt, - std::vector>& submit_times) -{ - const auto st = std::chrono::high_resolution_clock::now(); - const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, n_iocbs, aio_ctxt->_iocbs.data()); - submit_times.push_back(std::chrono::high_resolution_clock::now() - st); -#if DEBUG_DS_AIO_SUBMIT_PERF - printf("submit(usec) %f io_index=%lld nr=%lld buf=%p len=%lu off=%llu \n", - submit_times.back().count() * 1e6, - iocb_index, - n_iocbs, - aio_ctxt->_iocbs[0]->u.c.buf, - aio_ctxt->_iocbs[0]->u.c.nbytes, - aio_ctxt->_iocbs[0]->u.c.offset); -#endif - assert(submit_ret > 0); -} - -static int _do_io_complete(const long long int min_completes, - const long long int max_completes, - std::unique_ptr& aio_ctxt, - std::vector>& reap_times) -{ - const auto start_time = std::chrono::high_resolution_clock::now(); - const auto n_completes = io_getevents( - aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr); - reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time); - - assert(n_completes >= min_completes); - return n_completes; -} - -void do_aio_operation_sequential(const bool read_op, - std::unique_ptr& aio_ctxt, - std::unique_ptr& xfer_ctxt, - deepspeed_aio_config_t* config, - deepspeed_aio_perf_t* perf) -{ - struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs); - - const auto num_io_blocks = static_cast( - ceil(static_cast(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size)); -#if DEBUG_DS_AIO_PERF - const auto 
io_op_name = std::string(read_op ? "read" : "write"); - std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes - << " bytes with " << num_io_blocks << " io blocks" << std::endl; -#endif - - std::vector> submit_times; - std::vector> reap_times; - const auto max_queue_bytes = - static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); - - auto start = std::chrono::high_resolution_clock::now(); - for (long long iocb_index = 0; iocb_index < num_io_blocks; - iocb_index += aio_ctxt->_queue_depth) { - const auto start_offset = iocb_index * aio_ctxt->_block_size; - const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset; - const auto n_iocbs = - min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); - const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset)); - prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset); - - if (config->_single_submit) { - _do_io_submit_singles(n_iocbs, iocb_index, aio_ctxt, submit_times); - } else { - _do_io_submit_block(n_iocbs, iocb_index, aio_ctxt, submit_times); - } - - _do_io_complete(n_iocbs, n_iocbs, aio_ctxt, reap_times); - } - const std::chrono::duration elapsed = std::chrono::high_resolution_clock::now() - start; - - if (perf) { - _get_aio_latencies(submit_times, perf->_submit); - _get_aio_latencies(reap_times, perf->_complete); - perf->_e2e_usec = elapsed.count() * 1e6; - perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9); - } - -#if DEBUG_DS_AIO_PERF - _report_aio_statistics("submit", submit_times); - _report_aio_statistics("complete", reap_times); -#endif - -#if DEBUG_DS_AIO_PERF - std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6 - << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl; -#endif - -#if DEBUG_DS_AIO_PERF - std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes - << " bytes " << std::endl; -#endif -} - 
-void do_aio_operation_overlap(const bool read_op, - std::unique_ptr& aio_ctxt, - std::unique_ptr& xfer_ctxt, - deepspeed_aio_config_t* config, - deepspeed_aio_perf_t* perf) -{ - struct io_prep_generator io_gen(read_op, xfer_ctxt, aio_ctxt->_block_size); - -#if DEBUG_DS_AIO_PERF - const auto io_op_name = std::string(read_op ? "read" : "write"); - std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes - << " bytes with " << io_gen._num_io_blocks << " io blocks" << std::endl; -#endif - - std::vector> submit_times; - std::vector> reap_times; - - auto request_iocbs = aio_ctxt->_queue_depth; - auto n_pending_iocbs = 0; - const auto min_completes = 1; - auto start = std::chrono::high_resolution_clock::now(); - while (true) { - const auto n_iocbs = io_gen.prep_iocbs(request_iocbs - n_pending_iocbs, &aio_ctxt->_iocbs); - if (n_iocbs > 0) { - if (config->_single_submit) { - _do_io_submit_singles( - n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times); - } else { - _do_io_submit_block( - n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times); - } - } - - n_pending_iocbs += n_iocbs; - assert(n_pending_iocbs <= aio_ctxt->_queue_depth); - - if (n_pending_iocbs == 0) { break; } - - const auto n_complete = - _do_io_complete(min_completes, n_pending_iocbs, aio_ctxt, reap_times); - n_pending_iocbs -= n_complete; - } - - const std::chrono::duration elapsed = std::chrono::high_resolution_clock::now() - start; - - if (perf) { - _get_aio_latencies(submit_times, perf->_submit); - _get_aio_latencies(reap_times, perf->_complete); - perf->_e2e_usec = elapsed.count() * 1e6; - perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9); - } - -#if DEBUG_DS_AIO_PERF - _report_aio_statistics("submit", submit_times); - _report_aio_statistics("complete", reap_times); -#endif - -#if DEBUG_DS_AIO_PERF - std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6 - << " rate(GB/sec) = " << 
(xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl; -#endif - -#if DEBUG_DS_AIO_PERF - std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes - << " bytes " << std::endl; -#endif -} - -void report_file_error(const char* filename, const std::string file_op, const int error_code) -{ - std::string err_msg = file_op + std::string(" failed on ") + std::string(filename) + - " error = " + std::to_string(error_code); - std::cerr << c_library_name << ": " << err_msg << std::endl; -} - -int open_file(const char* filename, const bool read_op) -{ - const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT); - const int mode = 0600; - const auto fd = open(filename, flags, mode); - if (fd == -1) { - const auto error_code = errno; - const auto error_msg = read_op ? " open for read " : " open for write "; - report_file_error(filename, error_msg, error_code); - return -1; - } - return fd; -} - -int regular_read(const char* filename, std::vector& buffer) -{ - long long int num_bytes; - const auto f_size = get_file_size(filename, num_bytes); - assert(f_size != -1); - buffer.resize(num_bytes); - const auto fd = open(filename, O_RDONLY, 0600); - assert(fd != -1); - long long int read_bytes = 0; - auto r = 0; - do { - const auto buffer_ptr = buffer.data() + read_bytes; - const auto bytes_to_read = num_bytes - read_bytes; - r = read(fd, buffer_ptr, bytes_to_read); - read_bytes += r; - } while (r > 0); - - if (read_bytes != num_bytes) { - std::cerr << "read error " - << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes - << std::endl; - } - assert(read_bytes == num_bytes); - close(fd); - return 0; -} - -static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes) -{ - std::vector regular_buffer; - const auto reg_ret = regular_read(filename, regular_buffer); - assert(0 == reg_ret); - std::cout << "regular read of " << filename << " returned " << 
regular_buffer.size() << " bytes" - << std::endl; - - if (static_cast(regular_buffer.size()) != num_bytes) { return false; } - - return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size())); -} - -bool validate_aio_operation(const bool read_op, - const char* filename, - void* aio_buffer, - const long long int num_bytes) -{ - const auto msg_suffix = std::string("deepspeed_aio_") + - std::string(read_op ? "read()" : "write()") + - std::string("using read()"); - - if (false == _validate_buffer(filename, aio_buffer, num_bytes)) { - std::cout << "Fail: correctness of " << msg_suffix << std::endl; - return false; - } - - std::cout << "Pass: correctness of " << msg_suffix << std::endl; - return true; -} diff --git a/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.h b/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.h deleted file mode 100644 index cc62d33765c804e88816791c72a3477278738e76..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.h +++ /dev/null @@ -1,36 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#include -#include -#include -#include - -using namespace std; - -void do_aio_operation_sequential(const bool read_op, - std::unique_ptr& aio_ctxt, - std::unique_ptr& xfer_ctxt, - deepspeed_aio_config_t* config, - deepspeed_aio_perf_t* perf); - -void do_aio_operation_overlap(const bool read_op, - std::unique_ptr& aio_ctxt, - std::unique_ptr& xfer_ctxt, - deepspeed_aio_config_t* config, - deepspeed_aio_perf_t* perf); - -int open_file(const char* filename, const bool read_op); - -void report_file_error(const char* filename, const std::string file_op, const int error_code); - -int regular_read(const char* filename, std::vector& buffer); - -bool validate_aio_operation(const bool read_op, - const char* filename, - void* aio_buffer, - const long long int num_bytes); diff --git a/deepspeed/ops/csrc/aio/common/deepspeed_aio_types.cpp b/deepspeed/ops/csrc/aio/common/deepspeed_aio_types.cpp deleted file mode 100644 index e5811bb91149fad40422692ac7cde6f9348e0029..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/common/deepspeed_aio_types.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#include - -#include "deepspeed_aio_utils.h" - -using namespace std; - -const int c_block_size = 128 * 1024; -const int c_io_queue_depth = 8; - -deepspeed_aio_config_t::deepspeed_aio_config_t() - : _block_size(c_block_size), - _queue_depth(c_io_queue_depth), - _single_submit(false), - _overlap_events(false), - _lock_memory(false) -{ -} - -deepspeed_aio_config_t::deepspeed_aio_config_t(const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const bool lock_memory) - : _block_size(block_size), - _queue_depth(queue_depth), - _single_submit(single_submit), - _overlap_events(overlap_events), - _lock_memory(lock_memory) -{ -} - -void deepspeed_aio_latency_t::dump(const std::string tag) -{ - std::cout << tag << _min_usec << " " << _max_usec << " " << _avg_usec << " " << std::endl; -} - -void deepspeed_aio_latency_t::accumulate(const struct deepspeed_aio_latency_t& other) -{ - _min_usec += other._min_usec; - _max_usec += other._max_usec; - _avg_usec += other._avg_usec; -} - -void deepspeed_aio_latency_t::scale(const float scaler) -{ - _min_usec *= scaler; - _max_usec *= scaler; - _avg_usec *= scaler; -} - -aio_context::aio_context(const int block_size, const int queue_depth) -{ - _block_size = block_size; - _queue_depth = queue_depth; - for (auto i = 0; i < queue_depth; ++i) { - _iocbs.push_back((struct iocb*)calloc(1, sizeof(struct iocb))); - } - _io_events.resize(queue_depth); - io_queue_init(queue_depth, &_io_ctxt); -} - -aio_context::~aio_context() -{ - for (auto& iocb : _iocbs) { free(iocb); } - _io_events.resize(0); - io_queue_release(_io_ctxt); -} diff --git a/deepspeed/ops/csrc/aio/common/deepspeed_aio_types.h b/deepspeed/ops/csrc/aio/common/deepspeed_aio_types.h deleted file mode 100644 index be3b352d6be20733f7e03a821378a648384be0b5..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/common/deepspeed_aio_types.h +++ /dev/null @@ -1,57 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed 
Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. -*/ - -#include -#include - -#include -#include - -using namespace std; - -struct deepspeed_aio_latency_t { - double _min_usec; - double _max_usec; - double _avg_usec; - - void dump(const std::string tag); - void accumulate(const deepspeed_aio_latency_t&); - void scale(const float value); -}; - -struct deepspeed_aio_perf_t { - deepspeed_aio_latency_t _submit; - deepspeed_aio_latency_t _complete; - double _e2e_usec; - double _e2e_rate_GB; -}; - -struct deepspeed_aio_config_t { - const int _block_size; - const int _queue_depth; - const bool _single_submit; - const bool _overlap_events; - const bool _lock_memory; - - deepspeed_aio_config_t(); - deepspeed_aio_config_t(const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const bool lock_memory); -}; - -struct aio_context { - io_context_t _io_ctxt; - std::vector _io_events; - std::vector _iocbs; - int _block_size; - int _queue_depth; - - aio_context(const int block_size, const int queue_depth); - ~aio_context(); -}; diff --git a/deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.cpp b/deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.cpp deleted file mode 100644 index 200c7030f120366c2e2a45cb6cc20785ec4518fd..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#include - -#include "deepspeed_aio_utils.h" - -using namespace std; - -const int c_block_size = 128 * 1024; -const int c_io_queue_depth = 8; - -io_xfer_ctxt::io_xfer_ctxt(const int fd, - const long long int file_offset, - const long long int num_bytes, - const void* buffer) - : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes) -{ -} - -io_prep_context::io_prep_context(const bool read_op, - const std::unique_ptr& xfer_ctxt, - const size_t block_size, - const std::vector* iocbs) - : _read_op(read_op), _xfer_ctxt(xfer_ctxt), _block_size(block_size), _iocbs(iocbs) -{ -} - -void io_prep_context::prep_iocbs(const int n_iocbs, - const size_t num_bytes, - const void* start_buffer, - const long long int start_offset) -{ - assert(static_cast(n_iocbs) <= _iocbs->size()); - for (auto i = 0; i < n_iocbs; ++i) { - const auto shift = i * _block_size; - const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift; - const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift; - auto byte_count = _block_size; - if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; } - - if (_read_op) { - io_prep_pread(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset); - } else { - io_prep_pwrite(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset); - } - } -} - -io_prep_generator::io_prep_generator(const bool read_op, - const std::unique_ptr& xfer_ctxt, - const size_t block_size) - : _read_op(read_op), - _xfer_ctxt(xfer_ctxt), - _block_size(block_size), - _remaining_bytes(xfer_ctxt->_num_bytes), - _next_iocb_index(0) -{ - _num_io_blocks = - static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); - _remaining_io_blocks = _num_io_blocks; -} - -int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* iocbs) -{ - if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) { - assert(static_cast(_remaining_bytes) == _remaining_io_blocks); - return 0; - } - - 
assert(static_cast(n_iocbs) <= iocbs->size()); - - auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); - for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) { - const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size); - const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset; - const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); - - if (_read_op) { - io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); - } else { - io_prep_pwrite(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); - } - _remaining_bytes -= num_bytes; - } - _remaining_io_blocks -= actual_n_iocbs; - - return actual_n_iocbs; -} - -int get_file_size(const char* filename, long long int& size) -{ - struct stat st; - if (stat(filename, &st) == -1) { return -1; } - size = st.st_size; - return 0; -} - -void* ds_page_aligned_alloc(const size_t size, const bool lock) -{ - void* ptr; - int retval; - - retval = posix_memalign(&ptr, (size_t)sysconf(_SC_PAGESIZE), size); - if (retval) { return nullptr; } - - if (lock == false) { return ptr; } - - auto mlock_ret = mlock(ptr, size); - if (mlock_ret != 0) { - auto mlock_error = errno; - printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error)); - - free(ptr); - return nullptr; - } - - return ptr; -} diff --git a/deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.h b/deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.h deleted file mode 100644 index 6c5952749dd33d5e0059c209dc14ea755424da23..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.h +++ /dev/null @@ -1,77 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -struct io_xfer_ctxt { - const int _fd; - const long long int _base_offset; - const void* _mem_buffer; - const long long int _num_bytes; - - io_xfer_ctxt(const int fd, - const long long int file_offset, - const long long int num_bytes, - const void* buffer); -}; - -struct io_prep_context { - const bool _read_op; - const std::unique_ptr& _xfer_ctxt; - const size_t _block_size; - const std::vector* _iocbs; - - io_prep_context(const bool read_op, - const std::unique_ptr& xfer_ctxt, - const size_t block_size, - const std::vector* iocbs); - - void prep_iocbs(const int n_iocbs, - const size_t num_bytes, - const void* start_buffer, - const long long int start_offset); -}; - -struct io_prep_generator { - const bool _read_op; - const std::unique_ptr& _xfer_ctxt; - const size_t _block_size; - - long long int _remaining_bytes; - long long int _num_io_blocks; - long long int _remaining_io_blocks; - long long int _next_iocb_index; - - io_prep_generator(const bool read_op, - const std::unique_ptr& xfer_ctxt, - const size_t block_size); - - int prep_iocbs(const int n_iocbs, std::vector* iocbs); -}; - -void* ds_page_aligned_alloc(const size_t size, const bool lock = false); - -int get_file_size(const char* filename, long long int& size); diff --git a/deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.cpp deleted file mode 100644 index a2670fb7b4cbc6635c25d71e9b5f9bfd265f59c9..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#include "deepspeed_aio_thread.h" - -using namespace std; - -io_op_desc_t::io_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int num_bytes, - const bool validate) - : _read_op(read_op), - _buffer(buffer), - _fd(fd), - _filename(filename), - _num_bytes(num_bytes), - _validate(validate) -{ - _cpu_buffer = _buffer.is_cuda() ? _buffer.to(torch::kCPU).pin_memory() : _buffer; - _contiguous_buffer = _cpu_buffer.contiguous(); -} - -char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } - -void io_op_desc_t::fini() -{ - if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } -} - -deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config) - : _tid(tid), - _aio_config(aio_config), - _aio_ctxt(new aio_context(aio_config._block_size, aio_config._queue_depth)), - _time_to_exit(false) -{ -} - -deepspeed_aio_thread_t::~deepspeed_aio_thread_t() {} - -void deepspeed_aio_thread_t::run() -{ - while (true) { - std::shared_ptr next_io_op = nullptr; - - { - std::unique_lock lock(_work_sync._mutex); - _work_sync._cond_var.wait(lock, - [this] { return (!_work_queue.empty() || _time_to_exit); }); - if (!_work_queue.empty()) { - next_io_op = _work_queue.front(); - _work_queue.pop(); - } - } - - if (next_io_op) { - const auto base_offset = next_io_op->_num_bytes * _tid; - - std::unique_ptr xfer_ctxt(new io_xfer_ctxt( - next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr())); - - if (_aio_config._overlap_events) { - do_aio_operation_overlap( - next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } else { - do_aio_operation_sequential( - next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } - - { - std::lock_guard lock(_complete_sync._mutex); - _complete_queue.push(next_io_op); - } - _complete_sync._cond_var.notify_one(); - } - - if (_time_to_exit) { break; } - } 
-} diff --git a/deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.h b/deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.h deleted file mode 100644 index d1cfcab8bfc2446921422b83efa100444ce0dd31..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.h +++ /dev/null @@ -1,57 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. -*/ - -#include -#include -#include -#include "deepspeed_py_aio.h" - -struct io_op_desc_t { - const bool _read_op; - torch::Tensor _buffer; - int _fd; - const std::string _filename; - const long long int _num_bytes; - torch::Tensor _cpu_buffer; - torch::Tensor _contiguous_buffer; - const bool _validate; - - io_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int num_bytes, - const bool validate); - - char* data_ptr() const; - void fini(); -}; - -struct thread_sync_t { - std::mutex _mutex; - std::condition_variable _cond_var; -}; - -struct deepspeed_aio_thread_t { - const int _tid; - deepspeed_aio_config_t& _aio_config; - - std::unique_ptr _aio_ctxt; - std::queue> _work_queue; - std::queue> _complete_queue; - - bool _time_to_exit; - - struct thread_sync_t _work_sync; - struct thread_sync_t _complete_sync; - - deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config); - - ~deepspeed_aio_thread_t(); - - void run(); -}; diff --git a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.cpp b/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.cpp deleted file mode 100644 index 49ff1f240c433288a0e12c64389887c65926ad83..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ /dev/null @@ -1,121 +0,0 @@ - -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "deepspeed_py_aio.h" - -using namespace std; -using namespace std::chrono; - -#define DEBUG_DS_AIO_READ 0 -#define DEBUG_DS_AIO_WRITE 0 - -static const std::string c_library_name = "deepspeed_aio"; - -int deepspeed_py_aio_write(const torch::Tensor& buffer, - const char* filename, - const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const bool validate) -{ - const auto start_time = std::chrono::high_resolution_clock::now(); - deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false); - - const auto fd = open_file(filename, false); - if (fd == -1) { return -1; } - - auto write_buffer = (char*)buffer.data_ptr(); - const auto num_write_bytes = static_cast(buffer.nbytes()); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); - std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); - - if (config._overlap_events) { - do_aio_operation_overlap(false, aio_ctxt, xfer_ctxt, &config, nullptr); - } else { - do_aio_operation_sequential(false, aio_ctxt, xfer_ctxt, &config, nullptr); - } - const std::chrono::duration aio_time = - std::chrono::high_resolution_clock::now() - start_time; - - close(fd); - - if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } - - const std::chrono::duration fn_time = - std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; - return 0; -} - -int deepspeed_py_aio_read(torch::Tensor& buffer, - const char* filename, - const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const bool validate) -{ - const auto 
start_time = std::chrono::high_resolution_clock::now(); - long long num_file_bytes; - if (-1 == get_file_size(filename, num_file_bytes)) { - const auto error_code = errno; - report_file_error(filename, " fstat for read", error_code); - return -1; - } - - deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false); - const auto fd = open_file(filename, true); - if (fd == -1) { return -1; } - - auto read_buffer = (char*)buffer.data_ptr(); - assert(static_cast(buffer.nbytes()) == num_file_bytes); - - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); - std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); - - if (config._overlap_events) { - do_aio_operation_overlap(true, aio_ctxt, xfer_ctxt, &config, nullptr); - } else { - do_aio_operation_sequential(true, aio_ctxt, xfer_ctxt, &config, nullptr); - } - const std::chrono::duration aio_time = - std::chrono::high_resolution_clock::now() - start_time; - - close(fd); - - if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } - - const std::chrono::duration fn_time = - std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; - return 0; -} diff --git a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.h b/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.h deleted file mode 100644 index 230d88da9763a0130554ca83c5e3b1a5d914116f..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.h +++ /dev/null @@ -1,27 +0,0 @@ - -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#include -#include -#include - -int deepspeed_py_aio_write(const torch::Tensor& buffer, - const char* filename, - const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const bool validate); - -int deepspeed_py_aio_read(torch::Tensor& buffer, - const char* filename, - const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const bool validate); diff --git a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp deleted file mode 100644 index 417319f8ae5ce3bead644c80c094d9df1061879a..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ /dev/null @@ -1,282 +0,0 @@ - -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. -*/ - -#include "deepspeed_py_aio_handle.h" - -using namespace std; - -static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } - -deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const int num_threads) - : _aio_ctxt(new aio_context(block_size, queue_depth)), - _single_submit(single_submit), - _overlap_events(overlap_events), - _num_threads(num_threads), - _aio_config(block_size, queue_depth, single_submit, overlap_events, false), - _num_pending_ops(0) -{ - for (auto i = 0; i < num_threads; ++i) { - _thread_contexts.push_back(std::make_shared(i, _aio_config)); - } - - for (auto& ctxt : _thread_contexts) { - _threads.push_back(std::thread(_start_aio_thread, ctxt)); - } -} - -deepspeed_aio_handle_t::~deepspeed_aio_handle_t() -{ - _stop_threads(); - for (auto& thr : _threads) { thr.join(); } -} - -const int deepspeed_aio_handle_t::get_block_size() const -{ - return _aio_ctxt ? 
_aio_ctxt->_block_size : -1; -} - -const int deepspeed_aio_handle_t::get_queue_depth() const -{ - return _aio_ctxt ? _aio_ctxt->_queue_depth : -1; -} - -const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; } - -const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; } - -const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; } - -int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) -{ - const auto start_time = std::chrono::high_resolution_clock::now(); - - assert(_aio_ctxt); - - long long num_file_bytes; - if (-1 == get_file_size(filename, num_file_bytes)) { - const auto error_code = errno; - report_file_error(filename, " fstat for read", error_code); - return -1; - } - assert(static_cast(buffer.nbytes()) == num_file_bytes); - - const auto fd = open_file(filename, true); - if (fd == -1) { return -1; } - - auto read_buffer = (char*)buffer.data_ptr(); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); - - if (_aio_config._overlap_events) { - do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } else { - do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } - - close(fd); - const std::chrono::duration aio_time = - std::chrono::high_resolution_clock::now() - start_time; - - if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } - const std::chrono::duration fn_time = - std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; - return 0; -} - -int deepspeed_aio_handle_t::write(const torch::Tensor& buffer, - const char* filename, - const bool validate) -{ - assert(_aio_ctxt); - - const auto start_time = std::chrono::high_resolution_clock::now(); - - const auto fd = open_file(filename, 
false); - if (fd == -1) { return -1; } - - auto write_buffer = (char*)buffer.data_ptr(); - const auto num_write_bytes = static_cast(buffer.nbytes()); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); - - if (_aio_config._overlap_events) { - do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } else { - do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } - const std::chrono::duration aio_time = - std::chrono::high_resolution_clock::now() - start_time; - - close(fd); - - if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } - - const std::chrono::duration fn_time = - std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; - return 0; -} - -void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr scheduled_op) -{ - for (auto& ctxt : _thread_contexts) { - { - std::lock_guard lock(ctxt->_work_sync._mutex); - ctxt->_work_queue.push(scheduled_op); - } - ctxt->_work_sync._cond_var.notify_one(); - } - _num_pending_ops++; -} - -std::shared_ptr deepspeed_aio_handle_t::_wait_for_aio_work() -{ - std::shared_ptr completed_op = nullptr; - for (auto& ctxt : _thread_contexts) { - std::unique_lock lock(ctxt->_complete_sync._mutex); - ctxt->_complete_sync._cond_var.wait(lock, - [ctxt] { return !ctxt->_complete_queue.empty(); }); - completed_op = ctxt->_complete_queue.front(); - ctxt->_complete_queue.pop(); - } - return completed_op; -} - -void deepspeed_aio_handle_t::_stop_threads() -{ - assert(0 == _num_pending_ops); - for (auto& ctxt : _thread_contexts) { - { - std::lock_guard lock(ctxt->_work_sync._mutex); - ctxt->_time_to_exit = true; - } - ctxt->_work_sync._cond_var.notify_one(); - } -} - -int deepspeed_aio_handle_t::wait() -{ - assert(_num_pending_ops > 0); - auto num_completed_ops = 0; - - while 
(_num_pending_ops > 0) { - auto completed_op = _wait_for_aio_work(); - - completed_op->fini(); - - close(completed_op->_fd); - - if (completed_op->_validate) { - validate_aio_operation(completed_op->_read_op, - completed_op->_filename.c_str(), - completed_op->data_ptr(), - _num_threads * completed_op->_num_bytes); - } - --_num_pending_ops; - ++num_completed_ops; - } - - return num_completed_ops; -} - -bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, - const long long int num_bytes) -{ - const auto op_string = read_op ? "Read" : "Write"; - if (num_bytes % get_thread_count()) { - std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes - << " not divisible by thread count = " << get_thread_count() << std::endl; - return false; - } - - return true; -} - -int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async) -{ - long long num_file_bytes; - if (-1 == get_file_size(filename, num_file_bytes)) { - const auto error_code = errno; - report_file_error(filename, " fstat for read", error_code); - return -1; - } - const auto buffer_bytes = static_cast(buffer.nbytes()); - if (buffer_bytes != num_file_bytes) { - std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes - << " != " << num_file_bytes << std::endl; - } - assert(static_cast(buffer.nbytes()) == num_file_bytes); - assert((num_file_bytes % _num_threads) == 0); - - if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } - - const auto fd = open_file(filename, true); - if (fd == -1) { return -1; } - - auto scheduled_op = std::make_shared( - true, buffer, fd, filename, (num_file_bytes / _num_threads), validate); - - _schedule_aio_work(scheduled_op); - - if (async) { return 0; } - - return wait(); -} - -int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async) -{ - const auto 
num_write_bytes = static_cast(buffer.nbytes()); - assert((num_write_bytes % _num_threads) == 0); - - if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; } - - const auto fd = open_file(filename, false); - if (fd == -1) { return -1; } - - auto scheduled_op = std::make_shared( - false, buffer, fd, filename, (num_write_bytes / _num_threads), validate); - - _schedule_aio_work(scheduled_op); - - if (async) { return 0; } - - return wait(); -} - -int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename) -{ - return pread(buffer, filename, false, false); -} - -int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename) -{ - return pwrite(buffer, filename, false, false); -} - -int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename) -{ - return pread(buffer, filename, false, true); -} - -int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename) -{ - return pwrite(buffer, filename, false, true); -} diff --git a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.h deleted file mode 100644 index 22de4c3961d29abc94517b81ff38b7224822589c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ /dev/null @@ -1,68 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#include -#include -#include "deepspeed_aio_thread.h" - -struct deepspeed_aio_handle_t { - std::unique_ptr _aio_ctxt; - const bool _single_submit; - const bool _overlap_events; - const int _num_threads; - deepspeed_aio_config_t _aio_config; - - std::vector> _thread_contexts; - std::vector _threads; - int _num_pending_ops; - - deepspeed_aio_handle_t(const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const int num_threads); - - ~deepspeed_aio_handle_t(); - - const int get_block_size() const; - const int get_queue_depth() const; - const bool get_single_submit() const; - const bool get_overlap_events() const; - const int get_thread_count() const; - - int read(torch::Tensor& buffer, const char* filename, const bool validate); - - int write(const torch::Tensor& buffer, const char* filename, const bool validate); - - int pread(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async); - - int pwrite(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async); - - int sync_pread(torch::Tensor& buffer, const char* filename); - - int sync_pwrite(const torch::Tensor& buffer, const char* filename); - - int async_pread(torch::Tensor& buffer, const char* filename); - - int async_pwrite(const torch::Tensor& buffer, const char* filename); - - int wait(); - - void _stop_threads(); - - void _schedule_aio_work(std::shared_ptr scheduled_op); - - std::shared_ptr _wait_for_aio_work(); - - bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); -}; diff --git a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.cpp b/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.cpp deleted file mode 100644 index ee51147f9c414b184bb6ef81edd8905ca7fd4a78..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed 
under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. -*/ - -#include "deepspeed_py_copy.h" -#include - -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) - -#if defined(__AVX512__) or defined(__AVX256__) -union AVX_Data { -#if defined(__AVX512__) - __m512 data; -#else - __m256 data; -#endif -}; -#endif - -static void helper_memcpy_1(float* dest, float* src, size_t param_size) -{ - size_t rounded_size = 0; - -#if defined(__AVX512__) or defined(__AVX256__) - - rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH); - - for (size_t t = 0; t < rounded_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > rounded_size) copy_size = rounded_size - t; - size_t offset = copy_size + t; -#pragma omp parallel for - for (size_t i = t; i < offset; i += SIMD_WIDTH) { - AVX_Data src_4; - src_4.data = SIMD_LOAD(src + i); - - SIMD_STORE(dest + i, src_4.data); - } - } - -#endif - - if (param_size > rounded_size) { -#pragma omp parallel for - for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; } - } -} - -static void helper_memcpy_4(float* dest, float* src, size_t param_size) -{ - size_t rounded_size = 0; - -#if defined(__AVX512__) or defined(__AVX256__) - - rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2)); - - for (size_t t = 0; t < rounded_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > rounded_size) copy_size = rounded_size - t; - size_t offset = copy_size + t; -#pragma omp parallel for - for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { - AVX_Data src_4[4]; - src_4[0].data = SIMD_LOAD(src + i); - src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH); - src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1)); - src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3); - - SIMD_STORE(dest + i, src_4[0].data); - SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data); - SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data); - SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data); - } - } 
-#endif - if (param_size > rounded_size) - helper_memcpy_1((dest + rounded_size), (src + rounded_size), (param_size - rounded_size)); -} - -static void helper_mempcy_8(float* dest, float* src, size_t param_size) -{ - size_t rounded_size = 0; - -#if defined(__AVX512__) or defined(__AVX256__) - - rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2)); - - for (size_t t = 0; t < rounded_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > rounded_size) copy_size = rounded_size - t; - size_t offset = copy_size + t; -#pragma omp parallel for - for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { - AVX_Data src_4[8]; - src_4[0].data = SIMD_LOAD(src + i); - src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH); - src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1)); - src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3); - src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2)); - src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5); - src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6); - src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7); - - SIMD_STORE(dest + i, src_4[0].data); - SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data); - SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data); - SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data); - SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data); - SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data); - SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data); - SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data); - } - } -#endif - if (param_size > rounded_size) - helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size)); -} - -int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src) -{ - auto dest_c = dest.contiguous(); - auto src_c = src.contiguous(); - - float* dest_ptr = (float*)dest_c.data_ptr(); - float* src_ptr = (float*)src_c.data_ptr(); - - helper_mempcy_8(dest_ptr, src_ptr, dest_c.size(0)); - - return 0; -} diff --git 
a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.h b/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.h deleted file mode 100644 index 69b044851eca1cbea461925fca2133f433e77533..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.h +++ /dev/null @@ -1,42 +0,0 @@ - -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. -*/ - -#if (__x86_64__ || __i386__) -#include -#include -#endif - -#include -#include -#include - -#define TILE (1024 * 1024 * 1024) - -#if defined(__AVX512__) -#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) -#define SIMD_LOAD(x) _mm512_loadu_ps(x) -#define SIMD_SET(x) _mm512_set1_ps(x) -#define SIMD_MUL(x, y) _mm512_mul_ps(x, y) -#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) -#define SIMD_SQRT(x) _mm512_sqrt_ps(x) -#define SIMD_DIV(x, y) _mm512_div_ps(x, y) -#define SIMD_WIDTH 16 -#else -#if defined(__AVX256__) -#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) -#define SIMD_LOAD(x) _mm256_loadu_ps(x) -#define SIMD_SET(x) _mm256_set1_ps(x) -#define SIMD_MUL(x, y) _mm256_mul_ps(x, y) -#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) -#define SIMD_SQRT(x) _mm256_sqrt_ps(x) -#define SIMD_DIV(x, y) _mm256_div_ps(x, y) -#define SIMD_WIDTH 8 -#endif -#endif - -int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src); diff --git a/deepspeed/ops/csrc/aio/py_lib/py_ds_aio.cpp b/deepspeed/ops/csrc/aio/py_lib/py_ds_aio.cpp deleted file mode 100644 index 68590581ce2d985bc5209a73d9de4f515c987c30..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_lib/py_ds_aio.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
-*/ - -#include -#include "deepspeed_py_aio_handle.h" -#include "deepspeed_py_copy.h" - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read"); - - m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write"); - - m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy"); - - py::class_(m, "aio_handle") - .def(py::init()) - - .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) - .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) - .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit) - .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) - .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) - - .def("read", &deepspeed_aio_handle_t::read) - .def("write", &deepspeed_aio_handle_t::write) - - .def("pread", &deepspeed_aio_handle_t::pread) - .def("pwrite", &deepspeed_aio_handle_t::pwrite) - - .def("sync_pread", &deepspeed_aio_handle_t::sync_pread) - .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite) - .def("async_pread", &deepspeed_aio_handle_t::async_pread) - .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite) - - .def("wait", &deepspeed_aio_handle_t::wait); -} diff --git a/deepspeed/ops/csrc/aio/py_test/aio_bench_generate_param.py b/deepspeed/ops/csrc/aio/py_test/aio_bench_generate_param.py deleted file mode 100644 index caa833f5febbe26eabf3b155a236fa331899667c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/aio_bench_generate_param.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -Copyright 2021 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
-""" -import os -import argparse -import json -from parse_aio_stats import READ_SPEED, WRITE_SPEED, get_sorted_results -from perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument( - '--log_dir', - type=str, - default=BENCH_LOG_DIR, - help= - f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}' - ) - - args = parser.parse_args() - print(f'args = {args}') - - return args - - -def validate_args(args): - for d in [READ_LOG_DIR, WRITE_LOG_DIR]: - log_dir = os.path.join(args.log_dir, d) - if not os.path.isdir(log_dir): - print(f'{log_dir} folder is not existent') - return False - - return True - - -def convert_to_param(key): - assert len(key) == 6 - return { - "single_submit": "true" if key[0] == "single" else "false", - "overlap_events": "true" if key[1] == "overlap" else "false", - "thread_count": int(key[3]), - "queue_depth": int(key[4]), - "block_size": int(key[5]) - } - - -def generate_aio_param(read_log_dir, write_log_dir): - _, read_results = get_sorted_results(read_log_dir, READ_SPEED) - _, write_results = get_sorted_results(write_log_dir, WRITE_SPEED) - combined_perf = {key[1:]: value for key, value in read_results.items()} - - for key, value in write_results.items(): - new_key = key[1:] - if new_key in combined_perf: - combined_perf[new_key] += value - else: - combined_perf[new_key] = 0 - - optimal_key = None - optimal_perf = 0.0 - for key, value in combined_perf.items(): - if value > optimal_perf: - optimal_perf = value - optimal_key = key - - aio_param = {"aio": convert_to_param(optimal_key)} - - read_perf_keys = {key[1:]: key for key in read_results.keys()} - write_perf_keys = {key[1:]: key for key in write_results.keys()} - optimal_config_read = read_results.get(read_perf_keys[optimal_key], None) - optimal_config_write = write_results.get(write_perf_keys[optimal_key], None) - - print( - f'Best performance (GB/sec): read = 
{optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}' - ) - print(json.dumps(aio_param, indent=3)) - - -def main(): - print('Generate aio param') - args = parse_arguments() - if not validate_args(args): - quit() - - read_log_dir = os.path.join(args.log_dir, READ_LOG_DIR) - write_log_dir = os.path.join(args.log_dir, WRITE_LOG_DIR) - generate_aio_param(read_log_dir, write_log_dir) - - -if __name__ == "__main__": - main() diff --git a/deepspeed/ops/csrc/aio/py_test/aio_bench_perf_sweep.py b/deepspeed/ops/csrc/aio/py_test/aio_bench_perf_sweep.py deleted file mode 100644 index be6cd74f7ac6fff899a56a80d9e3969013243c12..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/aio_bench_perf_sweep.py +++ /dev/null @@ -1,397 +0,0 @@ -""" -Copyright 2021 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality of swapping optimizer tensors to/from (NVMe) storage devices. -""" -import os -import sys -import argparse -import json -import itertools -import subprocess -import shutil - -from test_ds_aio_utils import refine_integer_value -from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \ - READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR - -OTHER_OPTIONS = '--handle' -PERF_SCRIPT = 'test_ds_aio.py' -DEFAULT_SWEEP_CONFIG = { - "block_size": ["128K", - "256K"], - "queue_depth": [4, - 16, - 32], - "overlap_events": [True, - False], - "io_parallel": [2, - 8], - "single_submit": [False] -} - - -class Job(object): - def __init__(self, cmd_line, output_file=None, work_dir=None): - self.cmd_line = cmd_line - self.output_file = output_file - self.work_dir = work_dir - self.output_fd = None - - def cmd(self): - return self.cmd_line - - def get_stdout(self): - return self.output_fd - - def get_stderr(self): - return self.output_fd - - def get_cwd(self): - return self.work_dir - - def open_output_file(self): - if self.output_file is not None: - self.output_fd = open(self.output_file, 'w') - - def 
close_output_file(self): - if self.output_fd is not None: - self.output_fd.close() - self.output_fd = None - - -class SweepConfig(object): - def __init__(self, args): - self.nvme_dir = args.nvme_dir - self.io_size = args.io_size - self.search_space = get_sweep_config_dict(args.sweep_config) - self.read = not args.no_read - self.write = not args.no_write - self.flush_cache = not args.no_sudo - self.log_dir = args.log_dir - self.loops = args.loops - self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}' - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument( - '--nvme_dir', - required=True, - type=str, - help= - 'Directory in which to perform I/O tests. A writeable directory on a NVMe device.' - ) - - parser.add_argument('--sweep_config', - type=str, - default=None, - help='Performance sweep configuration json file.') - - parser.add_argument('--no_read', - action='store_true', - help='Disable read performance measurements.') - - parser.add_argument('--no_write', - action='store_true', - help='Disable write performance measurements.') - - parser.add_argument( - '--io_size', - type=str, - default="400M", - help='Number of I/O bytes to read/write for performance measurements.') - - parser.add_argument( - '--no_sudo', - action='store_true', - help= - 'Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.' - ) - - parser.add_argument( - '--log_dir', - type=str, - default=BENCH_LOG_DIR, - help= - f'Output directory for performance log files. 
Default is {os.path.join(".", BENCH_LOG_DIR)}' - ) - - parser.add_argument('--loops', - type=int, - default=1, - help='Count of operation repetitions') - - args = parser.parse_args() - print(f'args = {args}') - - return args - - -def dump_cmd_lines(cmd_lines): - print(f'cmd line count = {len(cmd_lines)}') - for i, cmd in enumerate(cmd_lines): - print(f'{i}: {cmd}') - - -def get_sweep_config_dict(sweep_config_json): - if sweep_config_json is None: - return DEFAULT_SWEEP_CONFIG - - with open(sweep_config_json) as fp: - sweep_config = json.load(fp) - return sweep_config - - -def get_sweep_cmd_lines(sweep_config_dict): - def flatten_options(key, value_list): - flat_list = [] - for v in value_list: - if not type(v) is bool: - flat_list.append(f'--{key} {v}') - elif v: - flat_list.append(f'--{key}') - else: - flat_list.append(' ') - - return flat_list - - flat_list = [flatten_options(key, value) for key, value in sweep_config_dict.items()] - cmd_list = list(itertools.product(*flat_list)) - cmd_list = [list(cmd) for cmd in cmd_list] - #dump_cmd_lines(cmd_list) - return cmd_list - - -def run_job(job): - args = ' '.join(job.cmd()) - print(f'args = {args}') - job.open_output_file() - proc = subprocess.run(args=args, - shell=True, - stdout=job.get_stdout(), - stderr=job.get_stderr(), - cwd=job.get_cwd()) - job.close_output_file() - assert proc.returncode == 0, \ - f"This command failed: {job.cmd()}" - - -def launch_sweep(sweep_jobs, sync_job, flush_cache_job): - for perf_job in sweep_jobs: - if flush_cache_job is not None: - run_job(sync_job) - run_job(flush_cache_job) - - run_job(perf_job) - - run_job(sync_job) - - -def create_cmd_tags(cmd_line): - tags = {} - for param_value in cmd_line: - fields = param_value.split() - if len(fields) == 1: - tags[fields[0]] = None - elif len(fields) == 2: - tags[fields[0]] = fields[1] - return tags - - -def get_log_file(io_op_desc, cmd_line): - QUEUE_DEPTH = "--queue_depth" - BLOCK_SIZE = "--block_size" - SINGLE_SUBMIT = "--single_submit" 
- OVERLAP_EVENTS = "--overlap_events" - THREAD_COUNT = "--threads" - IO_PARALLEL = "--io_parallel" - - tag_map = { - QUEUE_DEPTH: "d", - BLOCK_SIZE: "bs", - SINGLE_SUBMIT: "single", - OVERLAP_EVENTS: "overlap", - THREAD_COUNT: "t", - IO_PARALLEL: "p" - } - - tag_default = { - QUEUE_DEPTH: 1, - BLOCK_SIZE: "1M", - SINGLE_SUBMIT: "block", - OVERLAP_EVENTS: "sequential", - THREAD_COUNT: 1, - IO_PARALLEL: 1 - } - - def get_default_value(tag): - value = tag_default[tag] - if tag in [SINGLE_SUBMIT, OVERLAP_EVENTS]: - return value - return f'{tag_map[tag]}{value}' - - def get_config_value(tag, value): - tag_key = tag_map[tag] - if value is None: - return tag_key - return f'{tag_key}{value}' - - tag_list = [ - SINGLE_SUBMIT, - OVERLAP_EVENTS, - THREAD_COUNT, - IO_PARALLEL, - QUEUE_DEPTH, - BLOCK_SIZE - ] - log_tags = [io_op_desc] - cmd_tags = create_cmd_tags(cmd_line) - for tag in tag_list: - if tag in cmd_tags: - log_tags.append(get_config_value(tag, cmd_tags[tag])) - else: - log_tags.append(get_default_value(tag)) - - log_file = '_'.join(log_tags) - log_file += '.txt' - return log_file - - -def create_perf_jobs(io_op_desc, log_dir, cmd_lines): - py_cmd = ['python', os.path.join(script_path(), PERF_SCRIPT)] - - perf_jobs = [] - for cmd in cmd_lines: - log_file = os.path.join(log_dir, get_log_file(io_op_desc, cmd)) - job = Job(cmd_line=py_cmd + cmd, output_file=log_file) - perf_jobs.append(job) - - return perf_jobs - - -def script_path(): - return os.path.dirname(os.path.realpath(sys.argv[0])) - - -def async_io_setup(): - import deepspeed - from deepspeed.ops.aio import AsyncIOBuilder - return AsyncIOBuilder().is_compatible() - - -def get_block_size_and_count(io_bytes): - block_size = 1 - block_count = io_bytes - bytes_in_KB = 1024 - - while block_count % bytes_in_KB == 0: - block_size *= bytes_in_KB - block_count /= bytes_in_KB - - return int(block_size), int(block_count) - - -def create_read_file(sweep_config): - read_folder = os.path.join(sweep_config.nvme_dir, 
f'{READ_IO_DIR}') - os.makedirs(read_folder, exist_ok=True) - read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt') - block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size)) - dd_job = Job(cmd_line=[ - f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}' - ]) - print( - f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....' - ) - run_job(dd_job) - print( - f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....' - ) - return read_folder, read_file_name - - -def remove_folder(folder): - assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found" - shutil.rmtree(folder) - - -def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): - read_folder, read_file_name = create_read_file(sweep_config) - read_option = f'--read_file {read_file_name}' - read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd - for cmd in cmd_lines] - #dump_cmd_lines(read_cmd_lines) - - log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}') - os.makedirs(log_folder, exist_ok=True) - - perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, - log_dir=log_folder, - cmd_lines=read_cmd_lines) - - launch_sweep(sweep_jobs=perf_jobs, - sync_job=sync_job, - flush_cache_job=flush_cache_job) - - remove_folder(read_folder) - - -def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): - write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}') - os.makedirs(write_folder, exist_ok=True) - write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt') - write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}' - write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd - for cmd in cmd_lines] - #dump_cmd_lines(write_cmd_lines) - - log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}') - 
os.makedirs(log_folder, exist_ok=True) - - perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, - log_dir=log_folder, - cmd_lines=write_cmd_lines) - - launch_sweep(sweep_jobs=perf_jobs, - sync_job=sync_job, - flush_cache_job=flush_cache_job) - - remove_folder(write_folder) - - -def main(): - print("Running performance sweep of deepspeed nvme library") - - if not async_io_setup(): - error_msg = """ - Failing because environment is not properly configured for deepspeed async i/o module. - Possible fix: apt install libaio-dev. - """ - print(error_msg) - quit() - - args = parse_arguments() - sweep_config = SweepConfig(args) - cmd_lines = get_sweep_cmd_lines(sweep_config.search_space) - - if sweep_config.flush_cache: - flush_cache_job = Job( - cmd_line=['sudo', - 'bash -c', - "'echo 1 > /proc/sys/vm/drop_caches'"]) - else: - flush_cache_job = None - - sync_job = Job(cmd_line=['sync']) - - if sweep_config.read: - run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines) - - if sweep_config.write: - run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines) - - -if __name__ == "__main__": - main() diff --git a/deepspeed/ops/csrc/aio/py_test/ds_aio_basic.py b/deepspeed/ops/csrc/aio/py_test/ds_aio_basic.py deleted file mode 100644 index cf70b6655e9c1366371d24a6fb33808c41729e93..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/ds_aio_basic.py +++ /dev/null @@ -1,144 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
-""" - -import torch -import os -import time -from deepspeed.ops.aio import AsyncIOBuilder -from multiprocessing import Pool, Barrier -from test_ds_aio_utils import report_results, task_log, task_barrier - - -def pre_basic(args, tid, read_op): - io_string = "Read" if read_op else "Write" - num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size - file = args.read_file if read_op else f'{args.write_file}.{tid}' - - task_log(tid, f'Allocate tensor of size {num_bytes} bytes') - buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() - task_log( - tid, - f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' - ) - - ctxt = {} - ctxt['file'] = file - ctxt['num_bytes'] = num_bytes - ctxt['buffer'] = buffer - ctxt['elapsed_sec'] = 0 - - return ctxt - - -def pre_basic_read(pool_params): - args, tid = pool_params - ctxt = pre_basic(args, tid, True) - return ctxt - - -def pre_basic_write(pool_params): - args, tid = pool_params - ctxt = pre_basic(args, tid, False) - return ctxt - - -def post_basic(pool_params): - _, _, ctxt = pool_params - ctxt["buffer"].detach() - ctxt["buffer"] = None - return ctxt - - -def main_basic_read(pool_params): - args, tid, ctxt = pool_params - start_time = time.time() - AsyncIOBuilder().load().aio_read(ctxt['buffer'], - ctxt['file'], - args.block_size, - args.queue_depth, - args.single_submit, - args.overlap_events, - args.validate) - end_time = time.time() - ctxt['elapsed_sec'] += end_time - start_time - - return ctxt - - -def main_basic_write(pool_params): - args, tid, ctxt = pool_params - start_time = time.time() - AsyncIOBuilder().load().aio_write(ctxt['buffer'], - ctxt['file'], - args.block_size, - args.queue_depth, - args.single_submit, - args.overlap_events, - args.validate) - end_time = time.time() - ctxt['elapsed_sec'] += end_time - start_time - - return ctxt - - -def get_schedule(args, read_op): - schedule = {} - if read_op: - schedule['pre'] = pre_basic_read - 
schedule['post'] = post_basic - schedule['main'] = main_basic_read - else: - schedule['pre'] = pre_basic_write - schedule['post'] = post_basic - schedule['main'] = main_basic_write - - return schedule - - -def _aio_handle_tasklet(pool_params): - args, tid, read_op = pool_params - - # Create schedule - schedule = get_schedule(args, read_op) - task_log(tid, f'schedule = {schedule}') - task_barrier(aio_barrier, args.threads) - - # Run pre task - task_log(tid, f'running pre-task') - ctxt = schedule["pre"]((args, tid)) - task_barrier(aio_barrier, args.threads) - - # Run main tasks in a loop - ctxt["main_task_sec"] = 0 - for i in range(args.loops): - task_log(tid, f'running main task {i}') - start_time = time.time() - ctxt = schedule["main"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) - stop_time = time.time() - ctxt["main_task_sec"] += stop_time - start_time - - # Run post task - task_log(tid, f'running post-task') - ctxt = schedule["post"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) - - return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops - - -def _init_tasklet(b): - global aio_barrier - aio_barrier = b - - -def aio_basic_multiprocessing(args, read_op): - b = Barrier(args.threads) - pool_params = [(args, p, read_op) for p in range(args.threads)] - with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p: - pool_results = p.map(_aio_handle_tasklet, pool_params) - - report_results(args, read_op, pool_results) diff --git a/deepspeed/ops/csrc/aio/py_test/ds_aio_handle.py b/deepspeed/ops/csrc/aio/py_test/ds_aio_handle.py deleted file mode 100644 index 947ee2e6cb633e52c33c4b0ce06c56ad75b73f4c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/ds_aio_handle.py +++ /dev/null @@ -1,176 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
-""" - -import torch -import os -import time -from multiprocessing import Pool, Barrier -from deepspeed.ops.aio import AsyncIOBuilder -from test_ds_aio_utils import report_results, task_log, task_barrier - - -def pre_handle(args, tid, read_op): - io_string = "Read" if read_op else "Write" - num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size - file = args.read_file if read_op else f'{args.write_file}.{tid}' - - task_log(tid, f'Allocate tensor of size {num_bytes} bytes') - if args.gpu: - buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda') - else: - buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() - task_log( - tid, - f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' - ) - - io_parallel = args.io_parallel if args.io_parallel else 1 - handle = AsyncIOBuilder().load().aio_handle(args.block_size, - args.queue_depth, - args.single_submit, - args.overlap_events, - io_parallel) - task_log(tid, f'created deepspeed aio handle') - - ctxt = {} - ctxt['file'] = file - ctxt['num_bytes'] = num_bytes - ctxt['handle'] = handle - ctxt['buffer'] = buffer - ctxt['elapsed_sec'] = 0 - - return ctxt - - -def pre_handle_read(pool_params): - args, tid = pool_params - ctxt = pre_handle(args, tid, True) - return ctxt - - -def pre_handle_write(pool_params): - args, tid = pool_params - ctxt = pre_handle(args, tid, False) - return ctxt - - -def post_handle(pool_params): - _, _, ctxt = pool_params - ctxt["buffer"].detach() - ctxt["buffer"] = None - return ctxt - - -def main_parallel_read(pool_params): - args, tid, ctxt = pool_params - handle = ctxt['handle'] - - start_time = time.time() - ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True) - assert ret != -1 - handle.wait() - end_time = time.time() - ctxt['elapsed_sec'] += end_time - start_time - - return ctxt - - -def main_parallel_write(pool_params): - args, tid, ctxt = pool_params - handle = ctxt['handle'] - start_time 
= time.time() - ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True) - assert ret != -1 - handle.wait() - end_time = time.time() - ctxt['elapsed_sec'] += end_time - start_time - - return ctxt - - -def main_handle_read(pool_parms): - args, tid, ctxt = pool_parms - handle = ctxt['handle'] - - start_time = time.time() - ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate) - assert ret != -1 - end_time = time.time() - ctxt['elapsed_sec'] += end_time - start_time - - return ctxt - - -def main_handle_write(pool_parms): - args, tid, ctxt = pool_parms - handle = ctxt['handle'] - start_time = time.time() - ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate) - assert ret != -1 - end_time = time.time() - ctxt['elapsed_sec'] += end_time - start_time - - return ctxt - - -def get_schedule(args, read_op): - schedule = {} - if read_op: - schedule['pre'] = pre_handle_read - schedule['post'] = post_handle - schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read - else: - schedule['pre'] = pre_handle_write - schedule['post'] = post_handle - schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write - - return schedule - - -def _aio_handle_tasklet(pool_params): - args, tid, read_op = pool_params - - # Create schedule - schedule = get_schedule(args, read_op) - task_log(tid, f'schedule = {schedule}') - task_barrier(aio_barrier, args.threads) - - # Run pre task - task_log(tid, f'running pre-task') - ctxt = schedule["pre"]((args, tid)) - task_barrier(aio_barrier, args.threads) - - # Run main tasks in a loop - ctxt["main_task_sec"] = 0 - for i in range(args.loops): - task_log(tid, f'running main task {i}') - start_time = time.time() - ctxt = schedule["main"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) - stop_time = time.time() - ctxt["main_task_sec"] += stop_time - start_time - - # Run post task - task_log(tid, f'running post-task') - ctxt = schedule["post"]((args, tid, ctxt)) - 
task_barrier(aio_barrier, args.threads) - - return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops - - -def _init_tasklet(b): - global aio_barrier - aio_barrier = b - - -def aio_handle_multiprocessing(args, read_op): - b = Barrier(args.threads) - pool_params = [(args, p, read_op) for p in range(args.threads)] - with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p: - pool_results = p.map(_aio_handle_tasklet, pool_params) - - report_results(args, read_op, pool_results) diff --git a/deepspeed/ops/csrc/aio/py_test/parse_aio_stats.py b/deepspeed/ops/csrc/aio/py_test/parse_aio_stats.py deleted file mode 100644 index 1921973e4f735ffbe0cc0d67b0f970e4c15a47ab..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/parse_aio_stats.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
-""" - -import os -import argparse -import re - -READ_SPEED = 'read_speed' -WRITE_SPEED = 'write_speed' - -PERF_METRICS = [READ_SPEED, WRITE_SPEED] - -METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'} - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument('--log_dir', - type=str, - required=True, - help='Folder of statistics logs') - - parser.add_argument('--metric', - type=str, - required=True, - help='Performance metric to report: [read_speed|write_speed]') - - args = parser.parse_args() - print(f'args = {args}') - - return args - - -def extract_value(key, file): - INVALID_PREFIXES = ["ds"] - for p in INVALID_PREFIXES: - if key.startswith(p): - return key - try: - if key[0] in ['t', 'd', 'p']: - return int(key[1:]) - if key.startswith("bs"): - if key.endswith('K'): - v = key[2:].split('K') - return int(v[0]) * 1024 - elif key.endswith('M'): - v = key[2:].split('M') - return int(v[0]) * 1024 * 1024 - else: - return int(key[2:]) - except: - print(f"{file}: extract_value fails on {key}") - return None - - return key - - -def get_file_key(file): - f, _ = os.path.splitext(os.path.basename(file)) - fields = f.split('_') - values = [extract_value(k, file) for k in fields] - return tuple(values) - - -def get_thread_count(file): - f, _ = os.path.splitext(os.path.basename(file)) - fields = f.split('_') - for key in fields: - if key[0] == 't': - return int(key[1:]) - return 1 - - -""" -Extract performance metric from log file. 
-Sample file lines are: -Task Read Latency = 0.031647682189941406 sec -Task Read Speed = 12.342926020792527 GB/sec -E2E Read Latency = 0.031697988510131836 sec -E2E Read Speed = 12.323337169333062 GB/sec - -For the above sample, -metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned -""" - - -def get_metric(file, metric): - thread_count = get_thread_count(file) - with open(file) as f: - for line in f.readlines(): - if line.startswith(METRIC_SEARCH[metric]): - if metric in [READ_SPEED, WRITE_SPEED]: - fields = line.split() - return float(fields[-2]) - else: - fields = line.split('=') - return float(fields[-1]) - - return None - - -def validate_args(args): - if not args.metric in PERF_METRICS: - print(f'{args.metric} is not a valid performance metrics') - return False - - if not os.path.isdir(args.log_dir): - print(f'{args.log_dir} folder is not existent') - return False - - return True - - -def get_results(log_files, metric): - results = {} - for f in log_files: - file_key = get_file_key(f) - value = get_metric(f, metric) - results[file_key] = value - - return results - - -def get_sorted_results(log_dir, metric): - log_files = [ - f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, - f)) - ] - - log_files_path = [os.path.join(log_dir, f) for f in log_files] - results = get_results(log_files_path, metric) - result_keys = list(results.keys()) - sorted_keys = sorted(result_keys) - return sorted_keys, results - - -def main(): - print("Parsing aio statistics") - args = parse_arguments() - - if not validate_args(args): - quit() - - sorted_keys, results = get_sorted_results(args.log_dir, args.metric) - for k in sorted_keys: - print(f'{k} = {results[k]}') - - -if __name__ == "__main__": - main() diff --git a/deepspeed/ops/csrc/aio/py_test/perf_sweep_utils.py b/deepspeed/ops/csrc/aio/py_test/perf_sweep_utils.py deleted file mode 100644 index 2fd1a4c55d0fd90090edb223c0303f90be5a0db9..0000000000000000000000000000000000000000 --- 
a/deepspeed/ops/csrc/aio/py_test/perf_sweep_utils.py +++ /dev/null @@ -1,8 +0,0 @@ -SCRIPT_PREFIX = '_aio_bench' -WRITE_OP_DESC = 'write' -READ_OP_DESC = 'read' -READ_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_io' -WRITE_IO_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_io' -BENCH_LOG_DIR = f'{SCRIPT_PREFIX}_logs' -READ_LOG_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_logs' -WRITE_LOG_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_logs' diff --git a/deepspeed/ops/csrc/aio/py_test/run_read_sweep.sh b/deepspeed/ops/csrc/aio/py_test/run_read_sweep.sh deleted file mode 100644 index b9d7e050454a8382878b250442de3ffd7a07793d..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/run_read_sweep.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -if [[ $# -ne 2 ]]; then - echo "Usage: $0 " - exit 1 -fi - - -function validate_environment() -{ - validate_cmd="python ./validate_async_io.py" - eval ${validate_cmd} - res=$? - if [[ $res != 0 ]]; then - echo "Failing because environment is not properly configured" - echo "Possible fix: sudo apt-get install libaio-dev" - exit 1 - fi -} - - -validate_environment - -INPUT_FILE=$1 -if [[ ! 
-f ${INPUT_FILE} ]]; then - echo "Input file not found: ${INPUT_FILE}" - exit 1 -fi - -LOG_DIR=$2/aio_perf_sweep -RUN_SCRIPT=./test_ds_aio.py -READ_OPT="--read_file ${INPUT_FILE}" - -if [[ -d ${LOG_DIR} ]]; then - rm -f ${LOG_DIR}/* -else - mkdir -p ${LOG_DIR} -fi - -DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " -SYNC="sync" - -for sub in single block; do - if [[ $sub == "single" ]]; then - sub_opt="--single_submit" - else - sub_opt="" - fi - for ov in overlap sequential; do - if [[ $ov == "overlap" ]]; then - ov_opt="--overlap_events" - else - ov_opt="" - fi - for t in 1 2 4 8; do - for p in 1 ; do - for d in 1 2 4 8 16 32; do - for bs in 128K 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}" - OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}" - LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" - cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" - echo ${DISABLE_CACHE} - echo ${cmd} - echo ${SYNC} - - eval ${DISABLE_CACHE} - eval ${cmd} - eval ${SYNC} - sleep 2 - done - done - done - done - done -done diff --git a/deepspeed/ops/csrc/aio/py_test/run_write_sweep.sh b/deepspeed/ops/csrc/aio/py_test/run_write_sweep.sh deleted file mode 100644 index 99f2113dda6fed0a9b4a2dffa4436fdf5241e8f7..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/run_write_sweep.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash -function prep_folder() -{ - folder=$1 - if [[ -d ${folder} ]]; then - rm -f ${folder}/* - else - mkdir -p ${folder} - fi -} - -function validate_environment() -{ - validate_cmd="python ./validate_async_io.py" - eval ${validate_cmd} - res=$? 
- if [[ $res != 0 ]]; then - echo "Failing because environment is not properly configured" - echo "Possible fix: sudo apt-get install libaio-dev" - exit 1 - fi -} - - - -validate_environment - -if [[ $# -ne 3 ]]; then - echo "Usage: $0 " - exit 1 -fi - -SIZE="$1M" -WRITE_DIR=$2 -LOG_DIR=$3/aio_perf_sweep - -OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt -WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}" - - -prep_folder ${WRITE_DIR} -prep_folder ${LOG_DIR} - -RUN_SCRIPT=./test_ds_aio.py - -DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " -SYNC="sync" - -for sub in single block; do - if [[ $sub == "single" ]]; then - sub_opt="--single_submit" - else - sub_opt="" - fi - for ov in overlap sequential; do - if [[ $ov == "overlap" ]]; then - ov_opt="--overlap_events" - else - ov_opt="" - fi - for t in 1 2 4 8; do - for p in 1; do - for d in 1 2 4 8 16 32; do - for bs in 128K 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}" - OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}" - LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" - cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" - echo ${DISABLE_CACHE} - echo ${cmd} - echo ${SYNC} - - eval ${DISABLE_CACHE} - eval ${cmd} - eval ${SYNC} - sleep 2 - done - done - done - done - done -done diff --git a/deepspeed/ops/csrc/aio/py_test/single_process_config.json b/deepspeed/ops/csrc/aio/py_test/single_process_config.json deleted file mode 100644 index 275c54135cd83d3d8508ea1f769b823af9529821..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/single_process_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "block_size": [ - "128K", - "256K", - "1M" - ], - "queue_depth": [ - 4, - 16, - 32 - ], - "io_parallel": [ - 1, - 2, - 4, - 8 - ], - "single_submit": [ - true, - false - ], - "overlap_events": [ - true, - false - ], - "threads": [ - 1 - ] -} diff --git 
a/deepspeed/ops/csrc/aio/py_test/test_ds_aio.py b/deepspeed/ops/csrc/aio/py_test/test_ds_aio.py deleted file mode 100644 index f97d3e676c03c13d54c54b6cc23e24745b09f335..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/test_ds_aio.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality of swapping optimizer tensors to/from (NVMe) storage devices. -""" - -import os -import torch -import argparse -import time -import sys -from multiprocessing import Pool -import multiprocessing as mp -from ds_aio_basic import aio_basic_multiprocessing -from ds_aio_handle import aio_handle_multiprocessing -from test_ds_aio_utils import refine_args - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument('--read_file', type=str, default=None, help='Read file.') - - parser.add_argument('--write_file', type=str, default=None, help='Write file.') - - parser.add_argument('--write_size', - type=str, - default=None, - help='Number of bytes to write.') - - parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.') - - parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.') - - parser.add_argument('--threads', - type=int, - default=1, - help='Thread parallelism count.') - - parser.add_argument( - '--single_submit', - action='store_true', - help= - 'Submit I/O requests in singles (default is submit queue_depth amount at once.).' 
- ) - - parser.add_argument('--overlap_events', - action='store_true', - help='Overlap I/O submission and completion requests.') - - parser.add_argument('--validate', - action='store_true', - help='Perform validation in library.') - - parser.add_argument('--handle', action='store_true', help='Use AIO handle.') - - parser.add_argument('--loops', - type=int, - default=1, - help='Count of operation repetitions') - - parser.add_argument('--io_parallel', - type=int, - default=None, - help='Per iop parallelism') - - parser.add_argument('--gpu', action='store_true', help='Use GPU memory') - - args = parser.parse_args() - print(f'args = {args}') - return args - - -def validate_args(args): - if args.read_file and not os.path.isfile(args.read_file): - print(f'args validation error: {args.read_file} not found') - return False - - return True - - -def main(): - print(f'Testing deepspeed_aio python frontend') - - args = parse_arguments() - refine_args(args) - if not validate_args(args): - quit() - - mp.set_start_method('spawn') - multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing - if args.read_file: - multiprocess_function(args, True) - - if args.write_file: - multiprocess_function(args, False) - - -if __name__ == "__main__": - main() diff --git a/deepspeed/ops/csrc/aio/py_test/test_ds_aio_utils.py b/deepspeed/ops/csrc/aio/py_test/test_ds_aio_utils.py deleted file mode 100644 index c68dfdddc23343c5d3c0a623a4be33f11f78e628..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/test_ds_aio_utils.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
-""" - -import os - -BYTES_PER_GB = 1024**3 -LOG_TIDS = [0] - - -def task_log(tid, msg): - if tid in LOG_TIDS: - print(f'tid {tid}: {msg}') - - -def task_barrier(barrier, num_parties): - assert barrier.parties == num_parties - barrier.wait() - assert barrier.broken == False - - -def report_results(args, read_op, pool_results): - #print(f'pool_results = {pool_results}') - io_string = 'Read' if read_op else 'Write' - if None in pool_results: - print(f'Failure in one of {args.threads} {io_string} processes') - return - - total_bytes = sum([num_bytes for _, _, num_bytes in pool_results]) - - task_latency_sec = max([sec for _, sec, _ in pool_results]) - task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB - print(f'Task {io_string} Latency = {task_latency_sec} sec') - print(f'Task {io_string} Speed = {task_speed_GB} GB/sec') - - e2e_latency_sec = max([sec for sec, _, _ in pool_results]) - e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB - print(f'E2E {io_string} Latency = {e2e_latency_sec} sec') - print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec') - - -def refine_integer_value(value): - unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3} - - if value[-1] in list(unit_dict.keys()): - int_value = int(value[:-1]) * unit_dict[value[-1]] - return int_value - return int(value) - - -def refine_args(args): - if args.write_size and type(args.write_size) == str: - args.write_size = refine_integer_value(args.write_size) - - if args.block_size and type(args.block_size) == str: - args.block_size = refine_integer_value(args.block_size) diff --git a/deepspeed/ops/csrc/aio/py_test/validate_async_io.py b/deepspeed/ops/csrc/aio/py_test/validate_async_io.py deleted file mode 100644 index ceae84c840da9ba8902310170e1c7afe782a63ce..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/aio/py_test/validate_async_io.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -Copyright 2021 The Microsoft DeepSpeed Team -Licensed under the MIT license. 
- -Functionality of swapping optimizer tensors to/from (NVMe) storage devices. -""" -import deepspeed -from deepspeed.ops.aio import AsyncIOBuilder -assert AsyncIOBuilder().is_compatible() diff --git a/deepspeed/ops/csrc/common/custom_cuda_kernel.cu b/deepspeed/ops/csrc/common/custom_cuda_kernel.cu deleted file mode 100644 index f7a2b5d480dffae5f44b4558302c2c96240444ba..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/common/custom_cuda_kernel.cu +++ /dev/null @@ -1,39 +0,0 @@ -#include "custom_cuda_layers.h" - -__global__ void param_update_kernel(const float* input, __half* output, int size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - - if (id < size) { output[id] = (__half)input[id]; } -} - -void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream) -{ - int threads = 1024; - - dim3 grid_dim((size - 1) / threads + 1); - dim3 block_dim(threads); - - param_update_kernel<<>>(input, output, size); -} - -__global__ void param_update_kernel_half(const float* input, __half* output, int size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - __half2* output_cast = reinterpret_cast<__half2*>(output); - if (id < size) { - float input_f = input[id]; - __half2* input_h = reinterpret_cast<__half2*>(&input_f); - output_cast[id] = *input_h; - } -} - -void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream) -{ - int threads = 1024; - size /= 2; - dim3 grid_dim((size - 1) / threads + 1); - dim3 block_dim(threads); - - param_update_kernel_half<<>>(input, output, size); -} diff --git a/deepspeed/ops/csrc/common/custom_hip_kernel.hip b/deepspeed/ops/csrc/common/custom_hip_kernel.hip deleted file mode 100644 index 119647c587ff9d3bacd8eb03ef6c97e0e4ed00a4..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/common/custom_hip_kernel.hip +++ /dev/null @@ -1,41 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -__global__ void param_update_kernel(const float* input, __half* output, int size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - - if (id < size) { output[id] = (__half)input[id]; } -} - -void launch_param_update(const float* input, __half* output, int size, hipStream_t stream) -{ - int threads = 1024; - - dim3 grid_dim((size - 1) / threads + 1); - dim3 block_dim(threads); - - hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size); -} - -__global__ void param_update_kernel_half(const float* input, __half* output, int size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - __half2* output_cast = reinterpret_cast<__half2*>(output); - if (id < size) { - float input_f = input[id]; - __half2* input_h = reinterpret_cast<__half2*>(&input_f); - output_cast[id] = *input_h; - } -} - -void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream) -{ - int threads = 1024; - size /= 2; - dim3 grid_dim((size - 1) / threads + 1); - dim3 block_dim(threads); - - hipLaunchKernelGGL(( param_update_kernel_half), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size); -} diff --git a/deepspeed/ops/csrc/includes/StopWatch.h b/deepspeed/ops/csrc/includes/StopWatch.h deleted file mode 100644 index 9bf0401ebc78ffbe37c3b40d2466401731358051..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/StopWatch.h +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once -#ifdef _WIN32 -#include -#else -#include -#endif - -#ifdef _WIN32 - -class Stopwatch { -private: - double m_total_time; - LARGE_INTEGER m_start_time; - -public: - Stopwatch() { m_total_time = 0.0; } - - ~Stopwatch() {} - - void Reset() { m_total_time = 0.0; } - - void Start() { QueryPerformanceCounter(&m_start_time); } - - void Restart() - { - m_total_time = 0.0; - QueryPerformanceCounter(&m_start_time); - } - - void Stop() - { - LARGE_INTEGER 
frequency; - LARGE_INTEGER stop_time; - QueryPerformanceFrequency(&frequency); - QueryPerformanceCounter(&stop_time); - m_total_time += - ((double)(stop_time.QuadPart - m_start_time.QuadPart) / (double)frequency.QuadPart); - } - - double GetTimeInSeconds() { return m_total_time; } -}; - -#else - -class Stopwatch { -private: - double m_total_time; - struct timespec m_start_time; - bool m_is_started; - -public: - Stopwatch() - { - m_total_time = 0.0; - m_is_started = false; - } - - ~Stopwatch() {} - - void Reset() { m_total_time = 0.0; } - - void Start() - { - clock_gettime(CLOCK_MONOTONIC, &m_start_time); - m_is_started = true; - } - - void Restart() - { - m_total_time = 0.0; - clock_gettime(CLOCK_MONOTONIC, &m_start_time); - m_is_started = true; - } - - void Stop() - { - if (m_is_started) { - m_is_started = false; - - struct timespec end_time; - clock_gettime(CLOCK_MONOTONIC, &end_time); - - m_total_time += (double)(end_time.tv_sec - m_start_time.tv_sec) + - (double)(end_time.tv_nsec - m_start_time.tv_nsec) / 1e9; - } - } - - double GetTimeInSeconds() - { - if (m_is_started) { - Stop(); - Start(); - } - return m_total_time; - } -}; - -#endif diff --git a/deepspeed/ops/csrc/includes/Timer.h b/deepspeed/ops/csrc/includes/Timer.h deleted file mode 100644 index efc7fff84abb86b91473d1a532c78bf16e387384..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/Timer.h +++ /dev/null @@ -1,47 +0,0 @@ - -#ifndef __TIMER_H__ -#define __TIMER_H__ - -#include -#include -#include "cuda.h" - -class GPUTimer { - cudaEvent_t start, stop; - -public: - GPUTimer() - { - cudaEventCreate(&start); - cudaEventCreate(&stop); - } - ~GPUTimer() - { - cudaEventDestroy(start); - cudaEventDestroy(stop); - } - inline void Record() { cudaEventRecord(start); } - inline void Elapsed(float& time_elapsed) - { - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time_elapsed, start, stop); - } -}; - -class CPUTimer { - 
std::chrono::high_resolution_clock::time_point start; - -public: - CPUTimer() : start(std::chrono::high_resolution_clock::now()) {} - inline void Reset() { start = std::chrono::high_resolution_clock::now(); } - inline float Elapsed() - { - auto temp = start; - start = std::chrono::high_resolution_clock::now(); - return (float)(std::chrono::duration_cast(start - temp).count() / - 1e3); - } -}; - -#endif diff --git a/deepspeed/ops/csrc/includes/Timer_hip.h b/deepspeed/ops/csrc/includes/Timer_hip.h deleted file mode 100644 index 2ee3d6f2944655ab1de9864b39d58c9edec53ebf..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/Timer_hip.h +++ /dev/null @@ -1,48 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! - -#ifndef __TIMER_H__ -#define __TIMER_H__ - -#include -#include -#include "hip/hip_runtime.h" - -class GPUTimer { - hipEvent_t start, stop; - -public: - GPUTimer() - { - hipEventCreate(&start); - hipEventCreate(&stop); - } - ~GPUTimer() - { - hipEventDestroy(start); - hipEventDestroy(stop); - } - inline void Record() { hipEventRecord(start); } - inline void Elapsed(float& time_elapsed) - { - hipEventRecord(stop); - hipEventSynchronize(stop); - hipEventElapsedTime(&time_elapsed, start, stop); - } -}; - -class CPUTimer { - std::chrono::high_resolution_clock::time_point start; - -public: - CPUTimer() : start(std::chrono::high_resolution_clock::now()) {} - inline void Reset() { start = std::chrono::high_resolution_clock::now(); } - inline float Elapsed() - { - auto temp = start; - start = std::chrono::high_resolution_clock::now(); - return (float)(std::chrono::duration_cast(start - temp).count() / - 1e3); - } -}; - -#endif diff --git a/deepspeed/ops/csrc/includes/compat.h b/deepspeed/ops/csrc/includes/compat.h deleted file mode 100644 index 86f84a85065c9582119296223bb24193e71e060b..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/compat.h +++ /dev/null @@ -1,14 +0,0 @@ -/* Copyright 2020 The Microsoft 
DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 -*/ - -#ifndef TORCH_CHECK -#define TORCH_CHECK AT_CHECK -#endif - -#ifdef VERSION_GE_1_3 -#define DATA_PTR data_ptr -#else -#define DATA_PTR data -#endif diff --git a/deepspeed/ops/csrc/includes/context.h b/deepspeed/ops/csrc/includes/context.h deleted file mode 100644 index 5f0424116546f3080c760338d4e02e40ae63be59..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/context.h +++ /dev/null @@ -1,171 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" -#include "gemm_test.h" - -#define WARP_SIZE 32 - -#define CUDA_CHECK(callstr) \ - { \ - cudaError_t error_code = callstr; \ - if (error_code != cudaSuccess) { \ - std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ - assert(0); \ - } \ - } - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) - -#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \ - for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y) - -#define DS_CUDA_NUM_THREADS 512 -#define DS_MAXIMUM_NUM_BLOCKS 262144 - -inline int DS_GET_BLOCKS(const int N) -{ - return (std::max)( - (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS), - // Use at least 1 block, since CUDA does not allow empty block - 1); -} - -class Context { -public: - Context() : _workspace(nullptr), _seed(42), _curr_offset(0) - { - curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(_gen, 123); - if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) { - auto message = std::string("Fail to create cublas handle."); - std::cerr << message << std::endl; - throw 
std::runtime_error(message); - } - } - - virtual ~Context() - { - cublasDestroy(_cublasHandle); - cudaFree(_workspace); - } - - static Context& Instance() - { - static Context _ctx; - return _ctx; - } - - void SetWorkSpace(void* workspace) - { - if (!workspace) { throw std::runtime_error("Workspace is null."); } - _workspace = workspace; - } - - void* GetWorkSpace() { return _workspace; } - - curandGenerator_t& GetRandGenerator() { return _gen; } - - cudaStream_t GetCurrentStream() - { - // get current pytorch stream. - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - return stream; - } - - cudaStream_t GetNewStream() { return at::cuda::getStreamFromPool(); } - - cublasHandle_t GetCublasHandle() { return _cublasHandle; } - - std::pair IncrementOffset(uint64_t offset_inc) - { - uint64_t offset = _curr_offset; - _curr_offset += offset_inc; - return std::pair(_seed, offset); - } - - void SetSeed(uint64_t new_seed) { _seed = new_seed; } - - void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head) - { - // avoid rerun. 
- if (_gemm_algos.size() > 0) return; - - if (test_gemm) { - cublasHandle_t handle = GetCublasHandle(); - - std::unique_ptr> test_qkv_fw( - new GemmTest<__half>(batch_size * seq_len, // M - head_num * size_per_head, // N - head_num * size_per_head, // K - CUBLAS_OP_T, - CUBLAS_OP_N, - handle)); - - std::unique_ptr> test_inter( - new GemmTest<__half>(batch_size * seq_len, // M - 4 * head_num * size_per_head, // N - head_num * size_per_head, // K - CUBLAS_OP_T, - CUBLAS_OP_N, - handle)); - - std::unique_ptr> test_output( - new GemmTest<__half>(batch_size * seq_len, // M - head_num * size_per_head, // N - 4 * head_num * size_per_head, // K - CUBLAS_OP_T, - CUBLAS_OP_N, - handle)); - - std::unique_ptr> test_attn_scores( - new StridedGemmTest<__half>(batch_size * head_num, // batch - seq_len, // M - seq_len, // N - size_per_head, // K - CUBLAS_OP_T, - CUBLAS_OP_N, - handle)); - - std::unique_ptr> test_attn_context( - new StridedGemmTest<__half>(batch_size * head_num, // batch - size_per_head, // M - seq_len, // N - seq_len, // K - CUBLAS_OP_N, - CUBLAS_OP_N, - handle)); - - _gemm_algos.push_back(test_qkv_fw->TestAlgo(100)); - _gemm_algos.push_back(test_inter->TestAlgo(100)); - _gemm_algos.push_back(test_output->TestAlgo(100)); - _gemm_algos.push_back(test_attn_scores->TestAlgo(100)); - _gemm_algos.push_back(test_attn_context->TestAlgo(100)); - } else { - // Use default algo. 
- _gemm_algos.push_back(std::array({99, 99, 99})); - _gemm_algos.push_back(std::array({99, 99, 99})); - _gemm_algos.push_back(std::array({99, 99, 99})); - _gemm_algos.push_back(std::array({99, 99, 99})); - _gemm_algos.push_back(std::array({99, 99, 99})); - } - } - - const std::vector>& GetGemmAlgos() const { return _gemm_algos; } - -private: - curandGenerator_t _gen; - cublasHandle_t _cublasHandle; - void* _workspace; - uint64_t _seed; - uint64_t _curr_offset; - std::vector> _gemm_algos; -}; diff --git a/deepspeed/ops/csrc/includes/context_hip.h b/deepspeed/ops/csrc/includes/context_hip.h deleted file mode 100644 index 258b2bc27482e78d4458326386e4ef487e45fd54..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/context_hip.h +++ /dev/null @@ -1,172 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#pragma once - -#include -#include -#include -#include -#include -#include "rocblas.h" -#include "hip/hip_runtime.h" -#include "hiprand/hiprand.h" -#include "gemm_test_hip.h" - -#define WARP_SIZE 32 - -#define CUDA_CHECK(callstr) \ - { \ - hipError_t error_code = callstr; \ - if (error_code != hipSuccess) { \ - std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ - assert(0); \ - } \ - } - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) - -#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \ - for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y) - -#define DS_CUDA_NUM_THREADS 512 -#define DS_MAXIMUM_NUM_BLOCKS 262144 - -inline int DS_GET_BLOCKS(const int N) -{ - return (std::max)( - (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS), - // Use at least 1 block, since CUDA does not allow empty block - 1); -} - -class Context { -public: - Context() : 
_workspace(nullptr), _seed(42), _curr_offset(0) - { - hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT); - hiprandSetPseudoRandomGeneratorSeed(_gen, 123); - if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) { - auto message = std::string("Fail to create cublas handle."); - std::cerr << message << std::endl; - throw std::runtime_error(message); - } - } - - virtual ~Context() - { - rocblas_destroy_handle(_cublasHandle); - hipFree(_workspace); - } - - static Context& Instance() - { - static Context _ctx; - return _ctx; - } - - void SetWorkSpace(void* workspace) - { - if (!workspace) { throw std::runtime_error("Workspace is null."); } - _workspace = workspace; - } - - void* GetWorkSpace() { return _workspace; } - - hiprandGenerator_t& GetRandGenerator() { return _gen; } - - hipStream_t GetCurrentStream() - { - // get current pytorch stream. - hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA(); - return stream; - } - - hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); } - - rocblas_handle GetCublasHandle() { return _cublasHandle; } - - std::pair IncrementOffset(uint64_t offset_inc) - { - uint64_t offset = _curr_offset; - _curr_offset += offset_inc; - return std::pair(_seed, offset); - } - - void SetSeed(uint64_t new_seed) { _seed = new_seed; } - - void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head) - { - // avoid rerun. 
- if (_gemm_algos.size() > 0) return; - - if (test_gemm) { - rocblas_handle handle = GetCublasHandle(); - - std::unique_ptr> test_qkv_fw( - new GemmTest<__half>(batch_size * seq_len, // M - head_num * size_per_head, // N - head_num * size_per_head, // K - rocblas_operation_transpose, - rocblas_operation_none, - handle)); - - std::unique_ptr> test_inter( - new GemmTest<__half>(batch_size * seq_len, // M - 4 * head_num * size_per_head, // N - head_num * size_per_head, // K - rocblas_operation_transpose, - rocblas_operation_none, - handle)); - - std::unique_ptr> test_output( - new GemmTest<__half>(batch_size * seq_len, // M - head_num * size_per_head, // N - 4 * head_num * size_per_head, // K - rocblas_operation_transpose, - rocblas_operation_none, - handle)); - - std::unique_ptr> test_attn_scores( - new StridedGemmTest<__half>(batch_size * head_num, // batch - seq_len, // M - seq_len, // N - size_per_head, // K - rocblas_operation_transpose, - rocblas_operation_none, - handle)); - - std::unique_ptr> test_attn_context( - new StridedGemmTest<__half>(batch_size * head_num, // batch - size_per_head, // M - seq_len, // N - seq_len, // K - rocblas_operation_none, - rocblas_operation_none, - handle)); - - _gemm_algos.push_back(test_qkv_fw->TestAlgo(100)); - _gemm_algos.push_back(test_inter->TestAlgo(100)); - _gemm_algos.push_back(test_output->TestAlgo(100)); - _gemm_algos.push_back(test_attn_scores->TestAlgo(100)); - _gemm_algos.push_back(test_attn_context->TestAlgo(100)); - } else { - // Use default algo. 
- _gemm_algos.push_back(std::array({99, 99, 99})); - _gemm_algos.push_back(std::array({99, 99, 99})); - _gemm_algos.push_back(std::array({99, 99, 99})); - _gemm_algos.push_back(std::array({99, 99, 99})); - _gemm_algos.push_back(std::array({99, 99, 99})); - } - } - - const std::vector>& GetGemmAlgos() const { return _gemm_algos; } - -private: - hiprandGenerator_t _gen; - rocblas_handle _cublasHandle; - void* _workspace; - uint64_t _seed; - uint64_t _curr_offset; - std::vector> _gemm_algos; -}; diff --git a/deepspeed/ops/csrc/includes/cpu_adagrad.h b/deepspeed/ops/csrc/includes/cpu_adagrad.h deleted file mode 100644 index 6c21b7c8e82d36ae62c11d3cc8dfbc153af19549..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/cpu_adagrad.h +++ /dev/null @@ -1,150 +0,0 @@ -#pragma once - -#define NOMINMAX // Windows idiosyncrasy - // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c - -#include -#include -#include -#include -#include "cuda.h" -#include "custom_cuda_layers.h" -#include "simd.h" - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - __half* dev_param = nullptr, \ - bool half_precision = false); - -class Adagrad_Optimizer { -public: - Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0) - : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false) - { - cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = Context::Instance().GetCurrentStream(); - _streams[1] = Context::Instance().GetNewStream(); - } - ~Adagrad_Optimizer() - { - cudaFreeHost(_doubled_buffer[0]); - cudaFreeHost(_doubled_buffer[1]); - } -#if defined(__AVX512__) or defined(__AVX256__) - template - void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg_sq, - size_t param_size, - __half* dev_param = 
nullptr, - bool half_precision = false); -#endif - STEP(1) - STEP(4) - STEP(8) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); - } - inline void IncrementStep(size_t step) - { - _step++; - if (_step != step) { _step = step; } - } - inline void update_state(float lr, float epsilon, float weight_decay) - { - _alpha = lr; - _eps = epsilon; - _weight_decay = weight_decay; - } - -private: - float _alpha; - float _eps; - float _weight_decay; - - float _betta1_t; - float _betta2_t; - size_t _step; - - float* _doubled_buffer[2]; - bool _buf_index; - - cudaStream_t _streams[2]; -}; - -#if defined(__AVX512__) or defined(__AVX256__) -template -void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t new_rounded_size = 0; - AVX_Data eps_4; - eps_4.data = SIMD_SET(_eps); - - float step_size = -1 * _alpha; - AVX_Data step_size_4; - step_size_4.data = SIMD_SET(step_size); - - AVX_Data weight_decay4; - if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay); - new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span); - for (size_t t = 0; t < new_rounded_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; - size_t offset = copy_size + t; - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#pragma omp parallel for - for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { - AVX_Data grad_4[span]; - simd_load(grad_4, grads + i, half_precision); - - AVX_Data momentum_4[span]; - simd_load(momentum_4, grads + i, false); - - AVX_Data variance_4[span]; - simd_load(variance_4, _exp_avg_sq + i, false); - - AVX_Data param_4[span]; - simd_load(param_4, _params + i, half_precision); - - if (_weight_decay > 0) { simd_fma(grad_4, param_4, weight_decay4, grad_4); } - - simd_fma(variance_4, grad_4, grad_4, 
variance_4); - simd_sqrt(grad_4, variance_4); - simd_add(grad_4, grad_4, eps_4); - simd_div(grad_4, momentum_4, grad_4); - simd_fma(param_4, grad_4, step_size_4, param_4); - - simd_store(_params + i, param_4, half_precision); - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } - simd_store(_exp_avg_sq + i, variance_4, false); - } - - if (dev_params) { - if (half_precision) - launch_param_update_half( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; - } - } - *rounded_size = new_rounded_size; -} -#endif diff --git a/deepspeed/ops/csrc/includes/cpu_adagrad_hip.h b/deepspeed/ops/csrc/includes/cpu_adagrad_hip.h deleted file mode 100644 index cb012a1c9a9003c89235e920346ddae584462202..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/cpu_adagrad_hip.h +++ /dev/null @@ -1,151 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#pragma once - -#define NOMINMAX // Windows idiosyncrasy - // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c - -#include -#include -#include -#include -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" -#include "simd.h" - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - __half* dev_param = nullptr, \ - bool half_precision = false); - -class Adagrad_Optimizer { -public: - Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0) - : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false) - { - hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float)); - hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = Context::Instance().GetCurrentStream(); - _streams[1] = Context::Instance().GetNewStream(); - } - ~Adagrad_Optimizer() - { - hipHostFree(_doubled_buffer[0]); - hipHostFree(_doubled_buffer[1]); - } -#if defined(__AVX512__) or defined(__AVX256__) - template - void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg_sq, - size_t param_size, - __half* dev_param = nullptr, - bool half_precision = false); -#endif - STEP(1) - STEP(4) - STEP(8) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]); - } - inline void IncrementStep(size_t step) - { - _step++; - if (_step != step) { _step = step; } - } - inline void update_state(float lr, float epsilon, float weight_decay) - { - _alpha = lr; - _eps = epsilon; - _weight_decay = weight_decay; - } - -private: - float _alpha; - float _eps; - float _weight_decay; - - float _betta1_t; - float _betta2_t; - size_t _step; - - float* _doubled_buffer[2]; - bool _buf_index; - - hipStream_t _streams[2]; -}; - -#if defined(__AVX512__) or defined(__AVX256__) -template -void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - 
float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t new_rounded_size = 0; - AVX_Data eps_4; - eps_4.data = SIMD_SET(_eps); - - float step_size = -1 * _alpha; - AVX_Data step_size_4; - step_size_4.data = SIMD_SET(step_size); - - AVX_Data weight_decay4; - if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay); - new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span); - for (size_t t = 0; t < new_rounded_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; - size_t offset = copy_size + t; - if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); } -#pragma omp parallel for - for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { - AVX_Data grad_4[span]; - simd_load(grad_4, grads + i, half_precision); - - AVX_Data momentum_4[span]; - simd_load(momentum_4, grads + i, false); - - AVX_Data variance_4[span]; - simd_load(variance_4, _exp_avg_sq + i, false); - - AVX_Data param_4[span]; - simd_load(param_4, _params + i, half_precision); - - if (_weight_decay > 0) { simd_fma(grad_4, param_4, weight_decay4, grad_4); } - - simd_fma(variance_4, grad_4, grad_4, variance_4); - simd_sqrt(grad_4, variance_4); - simd_add(grad_4, grad_4, eps_4); - simd_div(grad_4, momentum_4, grad_4); - simd_fma(param_4, grad_4, step_size_4, param_4); - - simd_store(_params + i, param_4, half_precision); - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } - simd_store(_exp_avg_sq + i, variance_4, false); - } - - if (dev_params) { - if (half_precision) - launch_param_update_half( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; - } - } - *rounded_size = new_rounded_size; -} -#endif diff --git a/deepspeed/ops/csrc/includes/cpu_adam.h 
b/deepspeed/ops/csrc/includes/cpu_adam.h deleted file mode 100644 index 09677c6842dee6a4a9abe835c245864f07739aa9..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/cpu_adam.h +++ /dev/null @@ -1,225 +0,0 @@ -#pragma once - -#define NOMINMAX // Windows idiosyncrasy - // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c - -#include -#include -#include -#include -#include "cuda.h" -#include "custom_cuda_layers.h" -#include "simd.h" - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - __half* dev_param = nullptr, \ - bool half_precision = false); - -class Adam_Optimizer { -public: - Adam_Optimizer(float alpha = 1e-3, - float betta1 = 0.9, - float betta2 = 0.999, - float eps = 1e-8, - float weight_decay = 0, - bool adamw_mode = true) - : _alpha(alpha), - _betta1(betta1), - _betta2(betta2), - _eps(eps), - _weight_decay(weight_decay), - _betta1_t(1.0), - _betta2_t(1.0), - _step(0), - _buf_index(false), - _adamw_mode(adamw_mode) - { - cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = Context::Instance().GetCurrentStream(); - _streams[1] = Context::Instance().GetNewStream(); - } - ~Adam_Optimizer() - { - cudaFreeHost(_doubled_buffer[0]); - cudaFreeHost(_doubled_buffer[1]); - } -#if defined(__AVX512__) or defined(__AVX256__) - template - void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t param_size, - __half* dev_param = nullptr, - bool half_precision = false); -#endif - STEP(1) - STEP(4) - STEP(8) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); - } - inline void IncrementStep(size_t step, float beta1, float beta2) - { - if (beta1 != _betta1 || beta2 != _betta2) { - _step = step; - _betta1 = 
beta1; - _betta2 = beta2; - _betta1_t = std::pow(_betta1, step); - _betta2_t = std::pow(_betta2, step); - } else { - _step++; - if (_step != step) { - _betta1_t = std::pow(_betta1, step); - _betta2_t = std::pow(_betta2, step); - _step = step; - } else { - _betta1_t *= _betta1; - _betta2_t *= _betta2; - } - } - } - inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction) - { - _alpha = lr; - _eps = epsilon; - _weight_decay = weight_decay; - - _bias_correction1 = 1.0f; - _bias_correction2 = 1.0f; - if (bias_correction == 1) { - _bias_correction1 = 1 - _betta1_t; - _bias_correction2 = 1 / sqrt(1 - _betta2_t); - } - } - -private: - float _alpha; - float _betta1; - float _betta2; - float _eps; - float _weight_decay; - - float _betta1_t; - float _betta2_t; - size_t _step; - - float _bias_correction1; - float _bias_correction2; - - float* _doubled_buffer[2]; - bool _buf_index; - bool _adamw_mode; - - cudaStream_t _streams[2]; -}; - -#if defined(__AVX512__) or defined(__AVX256__) -template -void Adam_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t new_rounded_size = 0; - - AVX_Data betta1_4; - betta1_4.data = SIMD_SET(_betta1); - AVX_Data betta2_4; - betta2_4.data = SIMD_SET(_betta2); - - float betta1_minus1 = 1 - _betta1; - float betta2_minus1 = 1 - _betta2; - AVX_Data betta1_minus1_4; - betta1_minus1_4.data = SIMD_SET(betta1_minus1); - AVX_Data betta2_minus1_4; - betta2_minus1_4.data = SIMD_SET(betta2_minus1); - - AVX_Data bias2_sqrt; - bias2_sqrt.data = SIMD_SET(_bias_correction2); - - AVX_Data eps_4; - eps_4.data = SIMD_SET(_eps); - - float step_size = -1 * _alpha / _bias_correction1; - AVX_Data step_size_4; - step_size_4.data = SIMD_SET(step_size); - - float w_decay = -1 * _alpha * _weight_decay; - AVX_Data weight_decay4; - if (_weight_decay > 0) - weight_decay4.data = (_adamw_mode ? 
SIMD_SET(w_decay) : SIMD_SET(_weight_decay)); - new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span); - for (size_t t = 0; t < new_rounded_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; - size_t offset = copy_size + t; - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#pragma omp parallel for - for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { - AVX_Data grad_4[span]; - simd_load(grad_4, grads + i, half_precision); - - AVX_Data momentum_4[span]; - simd_load(momentum_4, _exp_avg + i, false); - - AVX_Data variance_4[span]; - simd_load(variance_4, _exp_avg_sq + i, false); - - AVX_Data param_4[span]; - simd_load(param_4, _params + i, half_precision); - - if (_weight_decay > 0 && !_adamw_mode) { - simd_fma(grad_4, param_4, weight_decay4, grad_4); - } - - simd_mul(momentum_4, momentum_4, betta1_4); - simd_fma(momentum_4, grad_4, betta1_minus1_4, momentum_4); - simd_mul(variance_4, variance_4, betta2_4); - simd_mul(grad_4, grad_4, grad_4); - simd_fma(variance_4, grad_4, betta2_minus1_4, variance_4); - simd_sqrt(grad_4, variance_4); - simd_fma(grad_4, grad_4, bias2_sqrt, eps_4); - simd_div(grad_4, momentum_4, grad_4); - - if (_weight_decay > 0 && _adamw_mode) { - simd_fma(param_4, param_4, weight_decay4, param_4); - } - - simd_fma(param_4, grad_4, step_size_4, param_4); - - simd_store(_params + i, param_4, half_precision); - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } - simd_store(_exp_avg + i, momentum_4, false); - simd_store(_exp_avg_sq + i, variance_4, false); - } - - if (dev_params) { - if (half_precision) - launch_param_update_half( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; - } - } - *rounded_size = new_rounded_size; -} -#endif 
diff --git a/deepspeed/ops/csrc/includes/cpu_adam_hip.h b/deepspeed/ops/csrc/includes/cpu_adam_hip.h deleted file mode 100644 index 3622f34cb1558c39e92fa7ac6d2752d37b96974d..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/cpu_adam_hip.h +++ /dev/null @@ -1,226 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#pragma once - -#define NOMINMAX // Windows idiosyncrasy - // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c - -#include -#include -#include -#include -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" -#include "simd.h" - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - __half* dev_param = nullptr, \ - bool half_precision = false); - -class Adam_Optimizer { -public: - Adam_Optimizer(float alpha = 1e-3, - float betta1 = 0.9, - float betta2 = 0.999, - float eps = 1e-8, - float weight_decay = 0, - bool adamw_mode = true) - : _alpha(alpha), - _betta1(betta1), - _betta2(betta2), - _eps(eps), - _weight_decay(weight_decay), - _betta1_t(1.0), - _betta2_t(1.0), - _step(0), - _buf_index(false), - _adamw_mode(adamw_mode) - { - hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float)); - hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = Context::Instance().GetCurrentStream(); - _streams[1] = Context::Instance().GetNewStream(); - } - ~Adam_Optimizer() - { - hipHostFree(_doubled_buffer[0]); - hipHostFree(_doubled_buffer[1]); - } -#if defined(__AVX512__) or defined(__AVX256__) - template - void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t param_size, - __half* dev_param = nullptr, - bool half_precision = false); -#endif - STEP(1) - STEP(4) - STEP(8) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]); - } - inline void 
IncrementStep(size_t step, float beta1, float beta2) - { - if (beta1 != _betta1 || beta2 != _betta2) { - _step = step; - _betta1 = beta1; - _betta2 = beta2; - _betta1_t = std::pow(_betta1, step); - _betta2_t = std::pow(_betta2, step); - } else { - _step++; - if (_step != step) { - _betta1_t = std::pow(_betta1, step); - _betta2_t = std::pow(_betta2, step); - _step = step; - } else { - _betta1_t *= _betta1; - _betta2_t *= _betta2; - } - } - } - inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction) - { - _alpha = lr; - _eps = epsilon; - _weight_decay = weight_decay; - - _bias_correction1 = 1.0f; - _bias_correction2 = 1.0f; - if (bias_correction == 1) { - _bias_correction1 = 1 - _betta1_t; - _bias_correction2 = 1 / sqrt(1 - _betta2_t); - } - } - -private: - float _alpha; - float _betta1; - float _betta2; - float _eps; - float _weight_decay; - - float _betta1_t; - float _betta2_t; - size_t _step; - - float _bias_correction1; - float _bias_correction2; - - float* _doubled_buffer[2]; - bool _buf_index; - bool _adamw_mode; - - hipStream_t _streams[2]; -}; - -#if defined(__AVX512__) or defined(__AVX256__) -template -void Adam_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - __half* dev_params, - bool half_precision) -{ - size_t new_rounded_size = 0; - - AVX_Data betta1_4; - betta1_4.data = SIMD_SET(_betta1); - AVX_Data betta2_4; - betta2_4.data = SIMD_SET(_betta2); - - float betta1_minus1 = 1 - _betta1; - float betta2_minus1 = 1 - _betta2; - AVX_Data betta1_minus1_4; - betta1_minus1_4.data = SIMD_SET(betta1_minus1); - AVX_Data betta2_minus1_4; - betta2_minus1_4.data = SIMD_SET(betta2_minus1); - - AVX_Data bias2_sqrt; - bias2_sqrt.data = SIMD_SET(_bias_correction2); - - AVX_Data eps_4; - eps_4.data = SIMD_SET(_eps); - - float step_size = -1 * _alpha / _bias_correction1; - AVX_Data step_size_4; - step_size_4.data = SIMD_SET(step_size); - - float 
w_decay = -1 * _alpha * _weight_decay; - AVX_Data weight_decay4; - if (_weight_decay > 0) - weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay)); - new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span); - for (size_t t = 0; t < new_rounded_size; t += TILE) { - size_t copy_size = TILE; - if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; - size_t offset = copy_size + t; - if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); } -#pragma omp parallel for - for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { - AVX_Data grad_4[span]; - simd_load(grad_4, grads + i, half_precision); - - AVX_Data momentum_4[span]; - simd_load(momentum_4, _exp_avg + i, false); - - AVX_Data variance_4[span]; - simd_load(variance_4, _exp_avg_sq + i, false); - - AVX_Data param_4[span]; - simd_load(param_4, _params + i, half_precision); - - if (_weight_decay > 0 && !_adamw_mode) { - simd_fma(grad_4, param_4, weight_decay4, grad_4); - } - - simd_mul(momentum_4, momentum_4, betta1_4); - simd_fma(momentum_4, grad_4, betta1_minus1_4, momentum_4); - simd_mul(variance_4, variance_4, betta2_4); - simd_mul(grad_4, grad_4, grad_4); - simd_fma(variance_4, grad_4, betta2_minus1_4, variance_4); - simd_sqrt(grad_4, variance_4); - simd_fma(grad_4, grad_4, bias2_sqrt, eps_4); - simd_div(grad_4, momentum_4, grad_4); - - if (_weight_decay > 0 && _adamw_mode) { - simd_fma(param_4, param_4, weight_decay4, param_4); - } - - simd_fma(param_4, grad_4, step_size_4, param_4); - - simd_store(_params + i, param_4, half_precision); - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } - simd_store(_exp_avg + i, momentum_4, false); - simd_store(_exp_avg_sq + i, variance_4, false); - } - - if (dev_params) { - if (half_precision) - launch_param_update_half( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], 
dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; - } - } - *rounded_size = new_rounded_size; -} -#endif diff --git a/deepspeed/ops/csrc/includes/cublas_wrappers.h b/deepspeed/ops/csrc/includes/cublas_wrappers.h deleted file mode 100644 index 9bb6cc30f6ae5c00013e75051118e2618a18312c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/cublas_wrappers.h +++ /dev/null @@ -1,87 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include - -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo algo = rocblas_gemm_algo_standard); -#else - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); -#endif - -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo algo = rocblas_gemm_algo_standard); -#else - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo algo = rocblas_gemm_algo_standard); -#else - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); -#endif - -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int 
stride_B, - int stride_C, - int batch, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo algo = rocblas_gemm_algo_standard); -#else - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif diff --git a/deepspeed/ops/csrc/includes/cublas_wrappers_hip.h b/deepspeed/ops/csrc/includes/cublas_wrappers_hip.h deleted file mode 100644 index ddfa1867cf78baa62954bc16b325121e8f569dbb..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/cublas_wrappers_hip.h +++ /dev/null @@ -1,88 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#pragma once - -#include -#include -#include -#include -#include -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include - -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo algo = rocblas_gemm_algo_standard); -#else - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); -#endif - -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo algo = rocblas_gemm_algo_standard); -#else - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo algo = rocblas_gemm_algo_standard); -#else - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); -#endif - -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const 
float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo algo = rocblas_gemm_algo_standard); -#else - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif diff --git a/deepspeed/ops/csrc/includes/custom_cuda_layers.h b/deepspeed/ops/csrc/includes/custom_cuda_layers.h deleted file mode 100644 index 30c633f72914f97fc0aa9f55f86cf963e7b49eea..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/custom_cuda_layers.h +++ /dev/null @@ -1,303 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#ifdef __HIP_PLATFORM_HCC__ -#define HALF_PRECISION_AVAILABLE = 1 -#include -#else -#if __CUDA_ARCH__ >= 700 -#define HALF_PRECISION_AVAILABLE = 1 -#endif -#include -#endif -#include - -#include "context.h" -#include "cublas_wrappers.h" - -#define CUDA_CHECK(callstr) \ - { \ - cudaError_t error_code = callstr; \ - if (error_code != cudaSuccess) { \ - std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ - assert(0); \ - } \ - } - -#define MAX_THREADS 1024 -#define THREADS 256 - -#define MAX_THREAD_STRIDE 32 -#define TILE_DIM 32 - -// Maximum sequence-length support based on the number of threads (2048) allowed in each block and -// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32 -#define MAX_THREAD_ITERATIONS 8 // Maximum 8K -#define MAX_WARP_NUM 32 - -#define MAX_REGISTERS 256 - -#define MAX_REG 256 - -#define WARP_SIZE_BITS 5 - -template -void launch_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template -void launch_sr_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template -void launch_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); 
-template -void launch_sr_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -// Fused bias add with gelu activation -template -void launch_bias_gelu(const T* input, - const T* bias, - T* output, - int intermediate_size, - int batch_size, - cudaStream_t stream); - -template -void launch_gelu(const T* input, - T* output, - int intermediate_size, - int batch_size, - cudaStream_t stream); - -template -void launch_d_gelu(T* d_output, - const T* input, - const T* bias, - int intermediate_size, - int batch_size, - cudaStream_t stream); - -// Custom fused bias add with layer normalization -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - T* vars, - T* means); - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - T* vars); - -template -void launch_layerNorm_backward_fused_add(const T* out_grad1, - const T* out_grad2, - const T* X_data, - const T* vars, - const T* means, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - cudaStream_t stream[2]); -template -void launch_layerNorm_backward_fused_add(const T* out_grad1, - const T* out_grad2, - const T* vals_hat, - const T* vars, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - cudaStream_t stream[2], - bool invertible = false, - const T* betta = nullptr); - -template -void launch_layerNorm_backward(const T* out_grad, - const T* X_data, - const T* vars, - const T* means, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - cudaStream_t stream[2]); - -template -void 
launch_layerNorm_backward(const T* out_grad, - const T* vals_hat, - const T* vars, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - cudaStream_t stream[2], - bool invertible = false, - const T* betta = nullptr); - -template -void launch_layerNorm_backward_nreversible(const T* out_grad, - const T* vals, - const T* out_grad_trans, - const T* vals_trans, - const T* means, - const T* vars, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - cudaStream_t stream[2]); - -template -void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, cudaStream_t stream); - -template -void launch_attn_softmax_backward(T* out_grad, - const T* soft_inp, - int batch_size, - int heads, - int seq_length, - cudaStream_t stream); - -template -void launch_attn_softmax_backward_v2(T* out_grad, - const T* soft_inp, - int batch_size, - int heads, - int seq_length, - cudaStream_t stream); - -// Custom softmax with scaling and attention mask addition -template -void launch_attn_softmax(T* vals, - const T* attn_mask, - int batch_size, - int heads, - int sequence_length, - cudaStream_t stream); - -template -void launch_transform_0213(T* output, - const T* vals, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - cudaStream_t stream); - -// Custom bias add -template -void launch_bias_add_transform_0213(T* outputs, - const T* vals, - const T* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - cudaStream_t stream, - int trans_count); - -// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3] -template -void launch_transform4d_0213(T* out, - const T* in, - int batch_size, - int heads, - int seq_length, - int hidden_dim, - cudaStream_t stream, - int trans_count); - -template -void launch_dropout(T* vals, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); - -template -void launch_dropout(T* vals_out, - const T* vals, 
- uint8_t* mask, - int total_count, - int dim, - float ratio, - cudaStream_t stream, - bool bwd = false); - -template -void launch_dropout(T* out, - const T* vals, - const T* residual, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); - -template -void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, cudaStream_t stream); - -template -void launch_dropout_grad(T* vals_out, - const T* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream); - -template -void launch_fuse_transpose_bias_kernel(const T* inp, - T* out, - int rows, - int cols, - cudaStream_t stream); - -void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream); -void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream); diff --git a/deepspeed/ops/csrc/includes/custom_hip_layers.h b/deepspeed/ops/csrc/includes/custom_hip_layers.h deleted file mode 100644 index 9f48b31941b7985b4c2eabee02b610d1a5d9d3f8..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/custom_hip_layers.h +++ /dev/null @@ -1,304 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#pragma once - -#include -#include -#include -#include - -#ifdef __HIP_PLATFORM_HCC__ -#define HALF_PRECISION_AVAILABLE = 1 -#include -#else -#if __CUDA_ARCH__ >= 700 -#define HALF_PRECISION_AVAILABLE = 1 -#endif -#include -#endif -#include - -#include "context_hip.h" -#include "cublas_wrappers_hip.h" - -#define CUDA_CHECK(callstr) \ - { \ - hipError_t error_code = callstr; \ - if (error_code != hipSuccess) { \ - std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ - assert(0); \ - } \ - } - -#define MAX_THREADS 1024 -#define THREADS 256 - -#define MAX_THREAD_STRIDE 32 -#define TILE_DIM 32 - -// Maximum sequence-length support based on the number of threads (2048) allowed in each block and -// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32 -#define MAX_THREAD_ITERATIONS 8 // Maximum 8K -#define MAX_WARP_NUM 32 - -#define MAX_REGISTERS 256 - -#define MAX_REG 256 - -#define WARP_SIZE_BITS 5 - -template -void launch_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); -template -void launch_sr_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); -template -void launch_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); -template -void launch_sr_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); -// Fused bias add with gelu activation -template -void launch_bias_gelu(const T* input, - const T* bias, - T* output, - int intermediate_size, - int batch_size, - hipStream_t stream); - -template -void launch_gelu(const T* input, - T* output, - int intermediate_size, - int batch_size, - hipStream_t stream); - -template -void launch_d_gelu(T* d_output, - const T* input, - const T* bias, - int intermediate_size, - int batch_size, - hipStream_t stream); - -// Custom fused bias add with layer normalization 
-template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - T* vars, - T* means); - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - T* vars); - -template -void launch_layerNorm_backward_fused_add(const T* out_grad1, - const T* out_grad2, - const T* X_data, - const T* vars, - const T* means, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - hipStream_t stream[2]); -template -void launch_layerNorm_backward_fused_add(const T* out_grad1, - const T* out_grad2, - const T* vals_hat, - const T* vars, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - hipStream_t stream[2], - bool invertible = false, - const T* betta = nullptr); - -template -void launch_layerNorm_backward(const T* out_grad, - const T* X_data, - const T* vars, - const T* means, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - hipStream_t stream[2]); - -template -void launch_layerNorm_backward(const T* out_grad, - const T* vals_hat, - const T* vars, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - hipStream_t stream[2], - bool invertible = false, - const T* betta = nullptr); - -template -void launch_layerNorm_backward_nreversible(const T* out_grad, - const T* vals, - const T* out_grad_trans, - const T* vals_trans, - const T* means, - const T* vars, - const T* gamma, - T* gamma_grad, - T* betta_grad, - T* inp_grad, - int batch_size, - int hidden_dim, - hipStream_t stream[2]); - -template -void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, 
hipStream_t stream); - -template -void launch_attn_softmax_backward(T* out_grad, - const T* soft_inp, - int batch_size, - int heads, - int seq_length, - hipStream_t stream); - -template -void launch_attn_softmax_backward_v2(T* out_grad, - const T* soft_inp, - int batch_size, - int heads, - int seq_length, - hipStream_t stream); - -// Custom softmax with scaling and attention mask addition -template -void launch_attn_softmax(T* vals, - const T* attn_mask, - int batch_size, - int heads, - int sequence_length, - hipStream_t stream); - -template -void launch_transform_0213(T* output, - const T* vals, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - hipStream_t stream); - -// Custom bias add -template -void launch_bias_add_transform_0213(T* outputs, - const T* vals, - const T* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - hipStream_t stream, - int trans_count); - -// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3] -template -void launch_transform4d_0213(T* out, - const T* in, - int batch_size, - int heads, - int seq_length, - int hidden_dim, - hipStream_t stream, - int trans_count); - -template -void launch_dropout(T* vals, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); - -template -void launch_dropout(T* vals_out, - const T* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - hipStream_t stream, - bool bwd = false); - -template -void launch_dropout(T* out, - const T* vals, - const T* residual, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); - -template -void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream); - -template -void launch_dropout_grad(T* vals_out, - const T* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream); - -template -void launch_fuse_transpose_bias_kernel(const T* inp, - T* out, - int rows, - int cols, - hipStream_t stream); - 
-void launch_param_update(const float* input, __half* output, int size, hipStream_t stream); -void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream); diff --git a/deepspeed/ops/csrc/includes/dropout.h b/deepspeed/ops/csrc/includes/dropout.h deleted file mode 100644 index a72572d0876e4ed30cb98270654fed8c201db6f0..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/dropout.h +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include -#include -#include - -template -class Dropout { -public: - struct Config { - float ratio; - uint32_t dim; - bool training; - - Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {} - - float RATIO() const { return training ? ratio : 0.0; } - inline void SetDim(uint32_t d) { dim = d; } - }; - - Dropout(const Config& config) : _config(config), _mask(nullptr) {} - - virtual ~Dropout() {} - - void Forward(int bsz, T* out, const T* vals, cudaStream_t stream, bool bwd = false) - { - launch_dropout( - out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd); - } - - void ForwardWithBias(int bsz, T* vals, const T* bias, cudaStream_t stream) - { - launch_dropout(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream); - } - - void ForwardWithBias(int bsz, - T* out, - const T* vals, - const T* residual, - const T* bias, - cudaStream_t stream) - { - launch_dropout( - out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream); - } - - void Backward(int bsz, T* d_vals, cudaStream_t stream) - { - launch_dropout_grad(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream); - } - - void Backward(int bsz, T* d_vals_out, const T* d_vals, cudaStream_t stream) - { - launch_dropout_grad( - d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream); - } - - bool HasDropout() const { return _config.RATIO() > 0.0; } - - void SetTrainingMode(bool training) { _config.training = training; } - - void SetMask(uint8_t* mask) - { - if 
(!mask) { throw std::runtime_error("Dropout mask is null."); } - - _mask = mask; - } - - Config GetConfig() const { return _config; } - - inline void SetDimension(uint32_t dim) { _config.SetDim(dim); } - -private: - uint8_t* _mask; - Config _config; -}; diff --git a/deepspeed/ops/csrc/includes/dropout_hip.h b/deepspeed/ops/csrc/includes/dropout_hip.h deleted file mode 100644 index 1bf352f9e7123b40da7b612692e07f4b1f2a783e..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/dropout_hip.h +++ /dev/null @@ -1,77 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#pragma once - -#include -#include -#include - -template -class Dropout { -public: - struct Config { - float ratio; - uint32_t dim; - bool training; - - Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {} - - float RATIO() const { return training ? ratio : 0.0; } - inline void SetDim(uint32_t d) { dim = d; } - }; - - Dropout(const Config& config) : _config(config), _mask(nullptr) {} - - virtual ~Dropout() {} - - void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false) - { - launch_dropout( - out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd); - } - - void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream) - { - launch_dropout(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream); - } - - void ForwardWithBias(int bsz, - T* out, - const T* vals, - const T* residual, - const T* bias, - hipStream_t stream) - { - launch_dropout( - out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream); - } - - void Backward(int bsz, T* d_vals, hipStream_t stream) - { - launch_dropout_grad(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream); - } - - void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream) - { - launch_dropout_grad( - d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream); - } - - bool HasDropout() const { return 
_config.RATIO() > 0.0; } - - void SetTrainingMode(bool training) { _config.training = training; } - - void SetMask(uint8_t* mask) - { - if (!mask) { throw std::runtime_error("Dropout mask is null."); } - - _mask = mask; - } - - Config GetConfig() const { return _config; } - - inline void SetDimension(uint32_t dim) { _config.SetDim(dim); } - -private: - uint8_t* _mask; - Config _config; -}; diff --git a/deepspeed/ops/csrc/includes/ds_transformer_cuda.h b/deepspeed/ops/csrc/includes/ds_transformer_cuda.h deleted file mode 100644 index 09afeb9d4b1950f0fc01cad7e7963359b485970c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/ds_transformer_cuda.h +++ /dev/null @@ -1,184 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "dropout.h" -#include "feed_forward.h" -#include "gelu.h" -#include "general_kernels.h" -#include "normalize_layer.h" -#include "softmax.h" -#include "strided_batch_gemm.h" - -struct BertGemmAlgos { - int m_gemm_qkv_algo; - int m_gemm_inter_algo; - int m_gemm_output_algo; - int m_gemm_batch1_algo; - int m_gemm_batch2_algo; - - BertGemmAlgos() - : m_gemm_qkv_algo(-1), - m_gemm_inter_algo(-1), - m_gemm_output_algo(-1), - m_gemm_batch1_algo(-1), - m_gemm_batch2_algo(-1) - { - } -}; - -template -class BertTransformerLayer { -public: - BertTransformerLayer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_size, - unsigned num_heads, - unsigned intermediate_size, - unsigned seq_length, - float attn_dropout_ratio, - float hidden_output_dropout_ratio, - float layer_norm_eps, - bool pre_or_postLayerNorm, - const std::vector>& gemm_algos, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode); - - virtual ~BertTransformerLayer(); - - void Forward(unsigned bsz, - const T* input_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_qkvb_ptr, - const T* attn_ow_ptr, - const T* attn_ob_ptr, - 
const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* output_b_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - T* out_ptr, - T* inp_norm_ptr, - T* q_tf_ptr, - T* k_tf_ptr, - T* v_tf_ptr, - T* softmax_output_ptr, - T* ctx_bufB_ptr, - T* attn_o_inp_ptr, - T* add_res_ptr, - T* ff1_inp_ptr, - T* gelu_inp_ptr, - T* ff2_inp_ptr); - - void Backward(unsigned bsz, - const T* grad_output_ptr, - const T* input_ptr, - const T* output_ptr, - const T* inp_norm_ptr, - const T* q_tf_ptr, - const T* k_tf_ptr, - const T* v_tf_ptr, - const T* softmax_output_ptr, - const T* ctx_bufB_ptr, - const T* attn_o_inp_ptr, - const T* add_res_ptr, - const T* ff1_inp_ptr, - const T* gelu_inp_ptr, - const T* ff2_inp_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_ow_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - - T* grad_input_ptr, - T* grad_attn_qkvw_ptr, - T* grad_attn_qkvb_ptr, - T* grad_attn_ow_ptr, - T* grad_attn_ob_ptr, - T* grad_attn_nw_ptr, - T* grad_attn_nb_ptr, - T* grad_inter_w_ptr, - T* grad_inter_b_ptr, - T* grad_output_w_ptr, - T* grad_output_b_ptr, - T* grad_norm_w_ptr, - T* grad_norm_b_ptr); - - void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, - uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr, - T* layer_norm_var, - T* layer_norm_mean, - T* attn_layer_norm_var, - T* attn_layer_norm_mean); - - inline unsigned GetBatchSize() const { return _batch_size; } - inline unsigned GetNumHeads() const { return _heads; } - inline unsigned GetSeqLength() const { return _seq_length; } - inline unsigned GetIntermediateSize() const { return _intermediate_size; } - - void SetSeqLength(unsigned seq_len); - inline unsigned GetHiddenSize() const { return _hidden_size; } - void SetTrainingMode(bool training); - inline bool 
IsTrainingMode() const { return _training; } - inline bool GeluCheckpoint() const { return _gelu_checkpoint; } - -private: - void Initialize(); - size_t getWorkspaceSize(int maxBatchSize) const; - - // Params - unsigned _layer_id; - unsigned _batch_size; - unsigned _hidden_size; - unsigned _heads; - unsigned _size_per_head; - unsigned _intermediate_size; - unsigned _seq_length; - - bool _pre_or_postLayerNorm; - - cublasHandle_t _cublasHandle; - cudaStream_t _stream; - - // layers - FeedForward _qkv_linear; - FeedForward _attn_out_linear; - Normalize_Layer _attn_layer_norm; - Normalize_Layer _layer_norm; - Normalize_Layer* _last_normalize; - FeedForward _ff1, _ff2; - Softmax _softmax; - Gelu _gelu; - Dropout _attn_prob_dropout; - Dropout _attn_output_dropout; - Dropout _layer_output_dropout; - StridedBatchGemm _attn_scores; - StridedBatchGemm _attn_context; - - bool _training; - - // Memory saving flags - bool _attn_dropout_checkpoint; - bool _normalize_invertible; - bool _gelu_checkpoint; - - // High Performance flags - bool _stochastic_mode; -}; diff --git a/deepspeed/ops/csrc/includes/ds_transformer_hip.h b/deepspeed/ops/csrc/includes/ds_transformer_hip.h deleted file mode 100644 index 502f2f38445cc0704d964b3c52e901eba09ce865..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/ds_transformer_hip.h +++ /dev/null @@ -1,185 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#pragma once - -#include -#include -#include -#include -#include "rocblas.h" -#include "hip/hip_runtime.h" -#include "dropout_hip.h" -#include "feed_forward_hip.h" -#include "gelu_hip.h" -#include "general_kernels_hip.h" -#include "normalize_layer_hip.h" -#include "softmax_hip.h" -#include "strided_batch_gemm_hip.h" - -struct BertGemmAlgos { - int m_gemm_qkv_algo; - int m_gemm_inter_algo; - int m_gemm_output_algo; - int m_gemm_batch1_algo; - int m_gemm_batch2_algo; - - BertGemmAlgos() - : m_gemm_qkv_algo(-1), - m_gemm_inter_algo(-1), - m_gemm_output_algo(-1), - m_gemm_batch1_algo(-1), - m_gemm_batch2_algo(-1) - { - } -}; - -template -class BertTransformerLayer { -public: - BertTransformerLayer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_size, - unsigned num_heads, - unsigned intermediate_size, - unsigned seq_length, - float attn_dropout_ratio, - float hidden_output_dropout_ratio, - float layer_norm_eps, - bool pre_or_postLayerNorm, - const std::vector>& gemm_algos, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode); - - virtual ~BertTransformerLayer(); - - void Forward(unsigned bsz, - const T* input_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_qkvb_ptr, - const T* attn_ow_ptr, - const T* attn_ob_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* output_b_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - T* out_ptr, - T* inp_norm_ptr, - T* q_tf_ptr, - T* k_tf_ptr, - T* v_tf_ptr, - T* softmax_output_ptr, - T* ctx_bufB_ptr, - T* attn_o_inp_ptr, - T* add_res_ptr, - T* ff1_inp_ptr, - T* gelu_inp_ptr, - T* ff2_inp_ptr); - - void Backward(unsigned bsz, - const T* grad_output_ptr, - const T* input_ptr, - const T* output_ptr, - const T* inp_norm_ptr, - const T* q_tf_ptr, - const T* k_tf_ptr, - const T* v_tf_ptr, - const T* softmax_output_ptr, - const T* ctx_bufB_ptr, - const T* 
attn_o_inp_ptr, - const T* add_res_ptr, - const T* ff1_inp_ptr, - const T* gelu_inp_ptr, - const T* ff2_inp_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_ow_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - - T* grad_input_ptr, - T* grad_attn_qkvw_ptr, - T* grad_attn_qkvb_ptr, - T* grad_attn_ow_ptr, - T* grad_attn_ob_ptr, - T* grad_attn_nw_ptr, - T* grad_attn_nb_ptr, - T* grad_inter_w_ptr, - T* grad_inter_b_ptr, - T* grad_output_w_ptr, - T* grad_output_b_ptr, - T* grad_norm_w_ptr, - T* grad_norm_b_ptr); - - void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, - uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr, - T* layer_norm_var, - T* layer_norm_mean, - T* attn_layer_norm_var, - T* attn_layer_norm_mean); - - inline unsigned GetBatchSize() const { return _batch_size; } - inline unsigned GetNumHeads() const { return _heads; } - inline unsigned GetSeqLength() const { return _seq_length; } - inline unsigned GetIntermediateSize() const { return _intermediate_size; } - - void SetSeqLength(unsigned seq_len); - inline unsigned GetHiddenSize() const { return _hidden_size; } - void SetTrainingMode(bool training); - inline bool IsTrainingMode() const { return _training; } - inline bool GeluCheckpoint() const { return _gelu_checkpoint; } - -private: - void Initialize(); - size_t getWorkspaceSize(int maxBatchSize) const; - - // Params - unsigned _layer_id; - unsigned _batch_size; - unsigned _hidden_size; - unsigned _heads; - unsigned _size_per_head; - unsigned _intermediate_size; - unsigned _seq_length; - - bool _pre_or_postLayerNorm; - - rocblas_handle _cublasHandle; - hipStream_t _stream; - - // layers - FeedForward _qkv_linear; - FeedForward _attn_out_linear; - Normalize_Layer _attn_layer_norm; - Normalize_Layer _layer_norm; - Normalize_Layer* _last_normalize; - FeedForward _ff1, 
_ff2; - Softmax _softmax; - Gelu _gelu; - Dropout _attn_prob_dropout; - Dropout _attn_output_dropout; - Dropout _layer_output_dropout; - StridedBatchGemm _attn_scores; - StridedBatchGemm _attn_context; - - bool _training; - - // Memory saving flags - bool _attn_dropout_checkpoint; - bool _normalize_invertible; - bool _gelu_checkpoint; - - // High Performance flags - bool _stochastic_mode; -}; diff --git a/deepspeed/ops/csrc/includes/feed_forward.h b/deepspeed/ops/csrc/includes/feed_forward.h deleted file mode 100644 index de7a9cf1bf9eaf686f387e4dd1b3a45b02f28e85..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/feed_forward.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef __FEEDFORWARD_H__ -#define __FEEDFORWARD_H__ - -#include -#include -#include -#include "custom_cuda_layers.h" - -template -class FeedForward { -public: - struct Config { - int batchSize, outputSize; - int inputSize; - std::array gemm_algos; - Config(int batch, int outputs, int inputs, const std::array& algos) - : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos) - { - } - }; - - FeedForward(Config config) : config_(config) {} - - ~FeedForward() {} - - void Forward(int bsz, - const T* input_ptr, - const T* weights, - T* out, - cublasHandle_t& _cublasHandle) - { - float alpha = T(1.); - float beta = T(0.); - - cublas_gemm_ex(_cublasHandle, - CUBLAS_OP_T, - CUBLAS_OP_N, - config_.outputSize, - bsz, - config_.inputSize, - &alpha, - &beta, - weights, - input_ptr, - out, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(config_.gemm_algos[0])); -#else - cublasGemmAlgo_t(config_.gemm_algos[0])); -#endif - } - void Backward(int bsz, - const T* out_grad, - const T* input_ptr, - const T* weights, - T* weights_grad, - T* bias_grad, - cublasHandle_t& _cublasHandle, - cudaStream_t& stream, - T* inp_grad_out = nullptr, - T* out_grad_trans_out = nullptr) - { - float alpha = (T)1.0, beta = (T)0.0; - cublas_gemm_ex(_cublasHandle, - CUBLAS_OP_N, - CUBLAS_OP_T, - 
config_.inputSize, - config_.outputSize, - bsz, - &alpha, - &beta, - input_ptr, - out_grad, - weights_grad, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(config_.gemm_algos[1])); -#else - cublasGemmAlgo_t(config_.gemm_algos[1])); -#endif - - cublas_gemm_ex(_cublasHandle, - CUBLAS_OP_N, - CUBLAS_OP_N, - config_.inputSize, - bsz, - config_.outputSize, - &alpha, - &beta, - weights, - out_grad, - inp_grad_out, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(config_.gemm_algos[2])); -#else - cublasGemmAlgo_t(config_.gemm_algos[2])); -#endif - - launch_fuse_transpose_bias_kernel(out_grad, bias_grad, bsz, config_.outputSize, stream); - } - -private: - Config config_; -}; - -#endif diff --git a/deepspeed/ops/csrc/includes/feed_forward_hip.h b/deepspeed/ops/csrc/includes/feed_forward_hip.h deleted file mode 100644 index e7e0600803f7ebe71f352c0300222c17e6f6365b..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/feed_forward_hip.h +++ /dev/null @@ -1,106 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#ifndef __FEEDFORWARD_H__ -#define __FEEDFORWARD_H__ - -#include -#include -#include -#include "custom_hip_layers.h" - -template -class FeedForward { -public: - struct Config { - int batchSize, outputSize; - int inputSize; - std::array gemm_algos; - Config(int batch, int outputs, int inputs, const std::array& algos) - : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos) - { - } - }; - - FeedForward(Config config) : config_(config) {} - - ~FeedForward() {} - - void Forward(int bsz, - const T* input_ptr, - const T* weights, - T* out, - rocblas_handle& _cublasHandle) - { - float alpha = T(1.); - float beta = T(0.); - - cublas_gemm_ex(_cublasHandle, - rocblas_operation_transpose, - rocblas_operation_none, - config_.outputSize, - bsz, - config_.inputSize, - &alpha, - &beta, - weights, - input_ptr, - out, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(config_.gemm_algos[0])); -#else - cublasGemmAlgo_t(config_.gemm_algos[0])); -#endif - } - void Backward(int bsz, - const T* out_grad, - const T* input_ptr, - const T* weights, - T* weights_grad, - T* bias_grad, - rocblas_handle& _cublasHandle, - hipStream_t& stream, - T* inp_grad_out = nullptr, - T* out_grad_trans_out = nullptr) - { - float alpha = (T)1.0, beta = (T)0.0; - cublas_gemm_ex(_cublasHandle, - rocblas_operation_none, - rocblas_operation_transpose, - config_.inputSize, - config_.outputSize, - bsz, - &alpha, - &beta, - input_ptr, - out_grad, - weights_grad, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(config_.gemm_algos[1])); -#else - cublasGemmAlgo_t(config_.gemm_algos[1])); -#endif - - cublas_gemm_ex(_cublasHandle, - rocblas_operation_none, - rocblas_operation_none, - config_.inputSize, - bsz, - config_.outputSize, - &alpha, - &beta, - weights, - out_grad, - inp_grad_out, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(config_.gemm_algos[2])); -#else - cublasGemmAlgo_t(config_.gemm_algos[2])); -#endif - - launch_fuse_transpose_bias_kernel(out_grad, bias_grad, bsz, 
config_.outputSize, stream); - } - -private: - Config config_; -}; - -#endif diff --git a/deepspeed/ops/csrc/includes/gelu.h b/deepspeed/ops/csrc/includes/gelu.h deleted file mode 100644 index 560f4140ed61e9455b78911da0a44c8944ce53ed..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/gelu.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include -#include -#include -#include "custom_cuda_layers.h" - -template -class Gelu { -public: - struct Config { - uint32_t intermediate_size; - Config(uint32_t inter_size) : intermediate_size(inter_size) {} - }; - - Gelu(const Config& config) : _config(config) {} - - virtual ~Gelu() {} - - void ForwardWithBiasAdd(int bsz, - const T* input_buf, - const T* bias, - T* output, - cudaStream_t stream) - { - launch_bias_gelu(input_buf, bias, output, _config.intermediate_size, bsz, stream); - } - - void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream) - { - launch_d_gelu(d_output, input_buf, bias, _config.intermediate_size, bsz, stream); - } - -private: - Config _config; -}; diff --git a/deepspeed/ops/csrc/includes/gelu_hip.h b/deepspeed/ops/csrc/includes/gelu_hip.h deleted file mode 100644 index 0297b66f394ec3b60c96c0453cf9ba6258296c1b..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/gelu_hip.h +++ /dev/null @@ -1,37 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#pragma once - -#include -#include -#include -#include "custom_hip_layers.h" - -template -class Gelu { -public: - struct Config { - uint32_t intermediate_size; - Config(uint32_t inter_size) : intermediate_size(inter_size) {} - }; - - Gelu(const Config& config) : _config(config) {} - - virtual ~Gelu() {} - - void ForwardWithBiasAdd(int bsz, - const T* input_buf, - const T* bias, - T* output, - hipStream_t stream) - { - launch_bias_gelu(input_buf, bias, output, _config.intermediate_size, bsz, stream); - } - - void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream) - { - launch_d_gelu(d_output, input_buf, bias, _config.intermediate_size, bsz, stream); - } - -private: - Config _config; -}; diff --git a/deepspeed/ops/csrc/includes/gemm_test.h b/deepspeed/ops/csrc/includes/gemm_test.h deleted file mode 100644 index 22c35123f2c776e2e87d53310c316497e55d214d..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/gemm_test.h +++ /dev/null @@ -1,327 +0,0 @@ - -#pragma once - -#include -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include -#include -#include -#include -#include -#include -#include "StopWatch.h" -#include "cublas_wrappers.h" - -template -void check(T result, char const* const func, const char* const file, int const line) -{ - if (result) { - std::cout << (std::string("CUDA runtime error: ") + +file + ":" + std::to_string(line) + - " \n"); - } -} - -#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) - -template -class GemmTest { -public: - GemmTest(int m, int n, int k, cublasOperation_t ta, cublasOperation_t tb, cublasHandle_t h) - : M(m), N(n), K(k), transa(ta), transb(tb), handle(h) - { - check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K)); - check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N)); - check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N)); - } - - ~GemmTest() - { - check_cuda_error(cudaFree(A)); - check_cuda_error(cudaFree(B)); - 
check_cuda_error(cudaFree(C)); - } - - std::array TestAlgo(int loops) - { - float alpha = (T)1.0f; - float beta = (T)0.0f; - - int algo_fw = Run(loops, [=](int algo) { - cublas_gemm_ex(handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - N, - M, - K, - &alpha, - &beta, - B, - A, - C, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - int algo_bw1 = Run(loops, [=](int algo) { - cublas_gemm_ex(handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - K, - N, - M, - &alpha, - &beta, - A, - C, - B, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - int algo_bw2 = Run(loops, [=](int algo) { - cublas_gemm_ex(handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - K, - M, - N, - &alpha, - &beta, - B, - C, - A, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - return std::array({algo_fw, algo_bw1, algo_bw2}); - } - - template - int Run(int loops, Func f) - { - float fast_latency = (std::numeric_limits::max)(); - int fast_algo = 0; - -#ifdef __HIP_PLATFORM_HCC__ - for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard; -#else - for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; -#endif - algo++) { - int warm_up = 5; - for (int i = 0; i < warm_up; ++i) f(algo); - - cudaDeviceSynchronize(); - Stopwatch timer; - timer.Restart(); - - for (int i = 0; i < loops; ++i) f(algo); - - cudaDeviceSynchronize(); - timer.Stop(); - - float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops; - - printf("algo-%d: %.3fms\n", algo, avg_latency); - - if (avg_latency < fast_latency) { - fast_latency = avg_latency; - fast_algo = algo; - } - } - - printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency); - - return fast_algo; - } - -private: - int M, N, K; - cublasHandle_t handle; - cublasOperation_t transa, transb; - T *A, *B, *C; -}; - -template -class StridedGemmTest { -public: - StridedGemmTest(int b, - int m, 
- int n, - int k, - cublasOperation_t ta, - cublasOperation_t tb, - cublasHandle_t h) - : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h) - { - check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K * bsz)); - check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N * bsz)); - check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N * bsz)); - } - - ~StridedGemmTest() - { - check_cuda_error(cudaFree(A)); - check_cuda_error(cudaFree(B)); - check_cuda_error(cudaFree(C)); - } - - std::array TestAlgo(int loops) - { - float alpha = (T)1.0f; - float beta = (T)0.0f; - - int algo_fw = Run(loops, [=](int algo) { - int stride_a = M * K; - int stride_b = N * K; - int stride_c = M * N; - - cublas_strided_batched_gemm(handle, - M, - N, - K, - &alpha, - &beta, - A, - B, - C, - transa, - transb, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - int algo_bw1 = Run(loops, [=](int algo) { - int mb = (transa == CUBLAS_OP_T ? K : M); - int kb = (transa == CUBLAS_OP_T ? M : K); - - int stride_a = mb * N; - int stride_b = N * kb; - int stride_c = M * K; - - // B need to transpose. - cublasOperation_t op_b = (transb == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T); - - // Calculate d_A. - cublas_strided_batched_gemm(handle, - mb, - kb, - N, - &alpha, - &beta, - (transa == CUBLAS_OP_T ? B : C), - (transa == CUBLAS_OP_T ? C : B), - A, - CUBLAS_OP_N, - op_b, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - int algo_bw2 = Run(loops, [=](int algo) { - // A need to transpose. - cublasOperation_t op_a = (transa == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T); - - int stride_a = M * K; - int stride_b = M * N; - int stride_c = N * K; - - // Calculate d_B. 
- cublas_strided_batched_gemm(handle, - K, - N, - M, - &alpha, - &beta, - A, - C, - B, - op_a, - CUBLAS_OP_N, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - return std::array({algo_fw, algo_bw1, algo_bw2}); - } - - template - int Run(int loops, Func f) - { - float fast_latency = (std::numeric_limits::max)(); - int fast_algo = 0; - -#ifdef __HIP_PLATFORM_HCC__ - for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard; -#else - for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; -#endif - algo++) { - int warm_up = 5; - for (int i = 0; i < warm_up; ++i) f(algo); - - cudaDeviceSynchronize(); - Stopwatch timer; - timer.Restart(); - - for (int i = 0; i < loops; ++i) f(algo); - - cudaDeviceSynchronize(); - timer.Stop(); - - float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops; - - printf("algo-%d: %.3fms\n", algo, avg_latency); - - if (avg_latency < fast_latency) { - fast_latency = avg_latency; - fast_algo = algo; - } - } - - printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency); - - return fast_algo; - } - -private: - int bsz, M, N, K; - cublasHandle_t handle; - cublasOperation_t transa, transb; - T *A, *B, *C; -}; diff --git a/deepspeed/ops/csrc/includes/gemm_test_hip.h b/deepspeed/ops/csrc/includes/gemm_test_hip.h deleted file mode 100644 index 117302ddb4a3b250512a04115d5ee856771928c9..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/gemm_test_hip.h +++ /dev/null @@ -1,328 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
- -#pragma once - -#include -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include -#include -#include -#include -#include -#include -#include "StopWatch.h" -#include "cublas_wrappers_hip.h" - -template -void check(T result, char const* const func, const char* const file, int const line) -{ - if (result) { - std::cout << (std::string("CUDA runtime error: ") + +file + ":" + std::to_string(line) + - " \n"); - } -} - -#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) - -template -class GemmTest { -public: - GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h) - : M(m), N(n), K(k), transa(ta), transb(tb), handle(h) - { - check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K)); - check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N)); - check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N)); - } - - ~GemmTest() - { - check_cuda_error(hipFree(A)); - check_cuda_error(hipFree(B)); - check_cuda_error(hipFree(C)); - } - - std::array TestAlgo(int loops) - { - float alpha = (T)1.0f; - float beta = (T)0.0f; - - int algo_fw = Run(loops, [=](int algo) { - cublas_gemm_ex(handle, - rocblas_operation_transpose, - rocblas_operation_none, - N, - M, - K, - &alpha, - &beta, - B, - A, - C, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - int algo_bw1 = Run(loops, [=](int algo) { - cublas_gemm_ex(handle, - rocblas_operation_none, - rocblas_operation_transpose, - K, - N, - M, - &alpha, - &beta, - A, - C, - B, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - int algo_bw2 = Run(loops, [=](int algo) { - cublas_gemm_ex(handle, - rocblas_operation_none, - rocblas_operation_none, - K, - M, - N, - &alpha, - &beta, - B, - C, - A, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - return std::array({algo_fw, algo_bw1, algo_bw2}); - } - - template - int Run(int loops, Func f) - { - 
float fast_latency = (std::numeric_limits::max)(); - int fast_algo = 0; - -#ifdef __HIP_PLATFORM_HCC__ - for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard; -#else - for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; -#endif - algo++) { - int warm_up = 5; - for (int i = 0; i < warm_up; ++i) f(algo); - - hipDeviceSynchronize(); - Stopwatch timer; - timer.Restart(); - - for (int i = 0; i < loops; ++i) f(algo); - - hipDeviceSynchronize(); - timer.Stop(); - - float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops; - - printf("algo-%d: %.3fms\n", algo, avg_latency); - - if (avg_latency < fast_latency) { - fast_latency = avg_latency; - fast_algo = algo; - } - } - - printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency); - - return fast_algo; - } - -private: - int M, N, K; - rocblas_handle handle; - rocblas_operation transa, transb; - T *A, *B, *C; -}; - -template -class StridedGemmTest { -public: - StridedGemmTest(int b, - int m, - int n, - int k, - rocblas_operation ta, - rocblas_operation tb, - rocblas_handle h) - : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h) - { - check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz)); - check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz)); - check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz)); - } - - ~StridedGemmTest() - { - check_cuda_error(hipFree(A)); - check_cuda_error(hipFree(B)); - check_cuda_error(hipFree(C)); - } - - std::array TestAlgo(int loops) - { - float alpha = (T)1.0f; - float beta = (T)0.0f; - - int algo_fw = Run(loops, [=](int algo) { - int stride_a = M * K; - int stride_b = N * K; - int stride_c = M * N; - - cublas_strided_batched_gemm(handle, - M, - N, - K, - &alpha, - &beta, - A, - B, - C, - transa, - transb, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - int algo_bw1 = 
Run(loops, [=](int algo) { - int mb = (transa == rocblas_operation_transpose ? K : M); - int kb = (transa == rocblas_operation_transpose ? M : K); - - int stride_a = mb * N; - int stride_b = N * kb; - int stride_c = M * K; - - // B need to transpose. - rocblas_operation op_b = (transb == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose); - - // Calculate d_A. - cublas_strided_batched_gemm(handle, - mb, - kb, - N, - &alpha, - &beta, - (transa == rocblas_operation_transpose ? B : C), - (transa == rocblas_operation_transpose ? C : B), - A, - rocblas_operation_none, - op_b, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - int algo_bw2 = Run(loops, [=](int algo) { - // A need to transpose. - rocblas_operation op_a = (transa == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose); - - int stride_a = M * K; - int stride_b = M * N; - int stride_c = N * K; - - // Calculate d_B. 
- cublas_strided_batched_gemm(handle, - K, - N, - M, - &alpha, - &beta, - A, - C, - B, - op_a, - rocblas_operation_none, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - static_cast(algo)); -#else - static_cast(algo)); -#endif - }); - - return std::array({algo_fw, algo_bw1, algo_bw2}); - } - - template - int Run(int loops, Func f) - { - float fast_latency = (std::numeric_limits::max)(); - int fast_algo = 0; - -#ifdef __HIP_PLATFORM_HCC__ - for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard; -#else - for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; -#endif - algo++) { - int warm_up = 5; - for (int i = 0; i < warm_up; ++i) f(algo); - - hipDeviceSynchronize(); - Stopwatch timer; - timer.Restart(); - - for (int i = 0; i < loops; ++i) f(algo); - - hipDeviceSynchronize(); - timer.Stop(); - - float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops; - - printf("algo-%d: %.3fms\n", algo, avg_latency); - - if (avg_latency < fast_latency) { - fast_latency = avg_latency; - fast_algo = algo; - } - } - - printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency); - - return fast_algo; - } - -private: - int bsz, M, N, K; - rocblas_handle handle; - rocblas_operation transa, transb; - T *A, *B, *C; -}; diff --git a/deepspeed/ops/csrc/includes/general_kernels.h b/deepspeed/ops/csrc/includes/general_kernels.h deleted file mode 100644 index e949309483ce8060c86ad1a46ca9264b1f45f810..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/general_kernels.h +++ /dev/null @@ -1,51 +0,0 @@ -#include -#include -#include -#include - -#ifdef __HIP_PLATFORM_HCC__ -#include -#else -#include -#endif -#include - -#include "context.h" -#include "cublas_wrappers.h" - -#define THREADS 256 -#define TILE_DIM 32 - -#define minus_infinity -1 * std::numeric_limits::infinity() - -#define FINAL_MASK 0xffffffff - -template -void launch_fused_add2(T* out, - const T* 
inp1, - const T* inp2, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream); - -template -void launch_fused_add4(T* out, - const T* inp1, - const T* inp2, - const T* inp3, - const T* inp4, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream); - -template -void launch_fused_add3(T* out, - const T* inp1, - const T* inp2, - const T* inp3, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream); diff --git a/deepspeed/ops/csrc/includes/general_kernels_hip.h b/deepspeed/ops/csrc/includes/general_kernels_hip.h deleted file mode 100644 index 2cafd95b2a1ba34f398b65f132f27846b57ac751..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/general_kernels_hip.h +++ /dev/null @@ -1,52 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include -#include -#include -#include - -#ifdef __HIP_PLATFORM_HCC__ -#include -#else -#include -#endif -#include - -#include "context_hip.h" -#include "cublas_wrappers_hip.h" - -#define THREADS 256 -#define TILE_DIM 32 - -#define minus_infinity -1 * std::numeric_limits::infinity() - -#define FINAL_MASK 0xffffffff - -template -void launch_fused_add2(T* out, - const T* inp1, - const T* inp2, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream); - -template -void launch_fused_add4(T* out, - const T* inp1, - const T* inp2, - const T* inp3, - const T* inp4, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream); - -template -void launch_fused_add3(T* out, - const T* inp1, - const T* inp2, - const T* inp3, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream); diff --git a/deepspeed/ops/csrc/includes/normalize_layer.h b/deepspeed/ops/csrc/includes/normalize_layer.h deleted file mode 100644 index b4d135ec495e61b041c0610deb966fb0468ca4a6..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/normalize_layer.h +++ /dev/null @@ -1,202 +0,0 @@ -#pragma 
once - -#include -#include -#include -#include -#include "custom_cuda_layers.h" - -using namespace std; - -template -class Normalize_Layer { -public: - struct Config { - uint32_t batchSize; - uint32_t seqLength; - uint32_t hiddenDim; - float epsilon; - bool training; - bool useMean; - Config(uint32_t batch, - uint32_t seq, - uint32_t h, - float epsilon = 1e-12, - bool training = true, - bool useMean = true) - : batchSize(batch), - seqLength(seq), - hiddenDim(h), - epsilon(epsilon), - training(training), - useMean(useMean) - { - } - }; - - Normalize_Layer(Config config) - : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr) - { - } - - ~Normalize_Layer() {} - - void ForwardCheckpoint(int bsz, // batch * seq - T* vals, - const T* residual, - const T* gamma, - const T* betta, - cudaStream_t& stream, - bool preLayerNorm = false) - { - launch_bias_residual_layer_norm(vals, - residual, - gamma, - betta, - config_.epsilon, - bsz, - config_.hiddenDim, - stream, - preLayerNorm, - config_.training, - vars, - means); - } - - void Forward(int bsz, - T* vals, - const T* residual, - const T* gamma, - const T* betta, - cudaStream_t& stream, - bool preLayerNorm = false) - { - launch_bias_residual_layer_norm(vals, - residual, - gamma, - betta, - config_.epsilon, - bsz, - config_.hiddenDim, - stream, - preLayerNorm, - config_.training, - vars); - } - - void Backward(int bsz, - const T* out_grad, - const T* gamma, - T* gamma_grad, - T* betta_grad, - cudaStream_t stream[2], - T* inp_grad_out, - const T* norm_in = nullptr) - { - launch_layerNorm_backward(out_grad, - norm_in, - vars, - means, - gamma, - gamma_grad, - betta_grad, - inp_grad_out, - bsz, - config_.hiddenDim, - stream); - } - - void Backward(int bsz, - const T* out_grad, - const T* gamma, - const T* betta, - T* gamma_grad, - T* betta_grad, - cudaStream_t stream[2], - T* inp_grad_out, - const T* norm_out) - { - launch_layerNorm_backward(out_grad, - norm_out, - vars, - gamma, - gamma_grad, - betta_grad, - 
inp_grad_out, - bsz, - config_.hiddenDim, - stream, - !config_.useMean, - betta); - } - - void BackwardFusedAdd(int bsz, - const T* out_grad1, - const T* out_grad2, - const T* gamma, - T* gamma_grad, - T* betta_grad, - cudaStream_t stream[2], - T* inp_grad_out, - const T* norm_in = nullptr) - { - launch_layerNorm_backward_fused_add(out_grad1, - out_grad2, - norm_in, - vars, - means, - gamma, - gamma_grad, - betta_grad, - inp_grad_out, - bsz, - config_.hiddenDim, - stream); - } - - void BackwardFusedAdd(int bsz, - const T* out_grad1, - const T* out_grad2, - const T* gamma, - const T* betta, - T* gamma_grad, - T* betta_grad, - cudaStream_t stream[2], - T* inp_grad_out, - const T* norm_out) - { - launch_layerNorm_backward_fused_add(out_grad1, - out_grad2, - norm_out, - vars, - gamma, - gamma_grad, - betta_grad, - inp_grad_out, - bsz, - config_.hiddenDim, - stream, - !config_.useMean, - betta); - } - - inline bool UseMean() const { return config_.useMean; } - - inline void SetVar(T* variance) - { - if (!variance) { throw std::runtime_error("Normalize variance is null."); } - vars = variance; - } - - inline void SetMean(T* mean) - { - if (!mean) { throw std::runtime_error("Normalize mean is null."); } - means = mean; - } - -private: - Config config_; - T* vars; - T* means; - T* vals_hat; -}; diff --git a/deepspeed/ops/csrc/includes/normalize_layer_hip.h b/deepspeed/ops/csrc/includes/normalize_layer_hip.h deleted file mode 100644 index 41702762d3f388c6c8a3346e0c2b8f219b20e922..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/normalize_layer_hip.h +++ /dev/null @@ -1,203 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#pragma once - -#include -#include -#include -#include -#include "custom_hip_layers.h" - -using namespace std; - -template -class Normalize_Layer { -public: - struct Config { - uint32_t batchSize; - uint32_t seqLength; - uint32_t hiddenDim; - float epsilon; - bool training; - bool useMean; - Config(uint32_t batch, - uint32_t seq, - uint32_t h, - float epsilon = 1e-12, - bool training = true, - bool useMean = true) - : batchSize(batch), - seqLength(seq), - hiddenDim(h), - epsilon(epsilon), - training(training), - useMean(useMean) - { - } - }; - - Normalize_Layer(Config config) - : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr) - { - } - - ~Normalize_Layer() {} - - void ForwardCheckpoint(int bsz, // batch * seq - T* vals, - const T* residual, - const T* gamma, - const T* betta, - hipStream_t& stream, - bool preLayerNorm = false) - { - launch_bias_residual_layer_norm(vals, - residual, - gamma, - betta, - config_.epsilon, - bsz, - config_.hiddenDim, - stream, - preLayerNorm, - config_.training, - vars, - means); - } - - void Forward(int bsz, - T* vals, - const T* residual, - const T* gamma, - const T* betta, - hipStream_t& stream, - bool preLayerNorm = false) - { - launch_bias_residual_layer_norm(vals, - residual, - gamma, - betta, - config_.epsilon, - bsz, - config_.hiddenDim, - stream, - preLayerNorm, - config_.training, - vars); - } - - void Backward(int bsz, - const T* out_grad, - const T* gamma, - T* gamma_grad, - T* betta_grad, - hipStream_t stream[2], - T* inp_grad_out, - const T* norm_in = nullptr) - { - launch_layerNorm_backward(out_grad, - norm_in, - vars, - means, - gamma, - gamma_grad, - betta_grad, - inp_grad_out, - bsz, - config_.hiddenDim, - stream); - } - - void Backward(int bsz, - const T* out_grad, - const T* gamma, - const T* betta, - T* gamma_grad, - T* betta_grad, - hipStream_t stream[2], - T* inp_grad_out, - const T* norm_out) - { - launch_layerNorm_backward(out_grad, - norm_out, - vars, - gamma, - gamma_grad, - betta_grad, - 
inp_grad_out, - bsz, - config_.hiddenDim, - stream, - !config_.useMean, - betta); - } - - void BackwardFusedAdd(int bsz, - const T* out_grad1, - const T* out_grad2, - const T* gamma, - T* gamma_grad, - T* betta_grad, - hipStream_t stream[2], - T* inp_grad_out, - const T* norm_in = nullptr) - { - launch_layerNorm_backward_fused_add(out_grad1, - out_grad2, - norm_in, - vars, - means, - gamma, - gamma_grad, - betta_grad, - inp_grad_out, - bsz, - config_.hiddenDim, - stream); - } - - void BackwardFusedAdd(int bsz, - const T* out_grad1, - const T* out_grad2, - const T* gamma, - const T* betta, - T* gamma_grad, - T* betta_grad, - hipStream_t stream[2], - T* inp_grad_out, - const T* norm_out) - { - launch_layerNorm_backward_fused_add(out_grad1, - out_grad2, - norm_out, - vars, - gamma, - gamma_grad, - betta_grad, - inp_grad_out, - bsz, - config_.hiddenDim, - stream, - !config_.useMean, - betta); - } - - inline bool UseMean() const { return config_.useMean; } - - inline void SetVar(T* variance) - { - if (!variance) { throw std::runtime_error("Normalize variance is null."); } - vars = variance; - } - - inline void SetMean(T* mean) - { - if (!mean) { throw std::runtime_error("Normalize mean is null."); } - means = mean; - } - -private: - Config config_; - T* vars; - T* means; - T* vals_hat; -}; diff --git a/deepspeed/ops/csrc/includes/quantizer.h b/deepspeed/ops/csrc/includes/quantizer.h deleted file mode 100644 index 79eeb14e2a0613255c61f647df44f04fbe9df6c6..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/quantizer.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include diff --git a/deepspeed/ops/csrc/includes/quantizer_hip.h b/deepspeed/ops/csrc/includes/quantizer_hip.h deleted file mode 100644 index 7fbbbaf2c5c67bf388ab93e8a57cbf575e72489b..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/quantizer_hip.h +++ /dev/null @@ -1,10 +0,0 @@ -// !!! 
This is a file automatically generated by hipify!!! -#pragma once - -#include -#include -#include -#include -#include -#include -#include diff --git a/deepspeed/ops/csrc/includes/simd.h b/deepspeed/ops/csrc/includes/simd.h deleted file mode 100644 index 44c4da83e01c02dc664aa712fe6847be0b3f6aa2..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/simd.h +++ /dev/null @@ -1,137 +0,0 @@ -#pragma once - -#if (__x86_64__ || __i386__) -#include -#include -#endif - -#define TILE (128 * 1024 * 1024) -#if defined(__AVX512__) or defined(__AVX256__) - -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) - -#if defined(__AVX512__) -#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) -#define SIMD_LOAD(x) _mm512_loadu_ps(x) -#define SIMD_SET(x) _mm512_set1_ps(x) -#define SIMD_ADD(x, y) _mm512_add_ps(x, y) -#define SIMD_MUL(x, y) _mm512_mul_ps(x, y) -#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) -#define SIMD_SQRT(x) _mm512_sqrt_ps(x) -#define SIMD_DIV(x, y) _mm512_div_ps(x, y) -#define SIMD_WIDTH 16 - -#define SIMD_LOAD2(x, h) \ - ((h) ? _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)x)) : _mm512_loadu_ps(x)) -#define SIMD_STORE2(x, d, h) \ - ((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ - : _mm512_storeu_ps(x, d)) - -#define INTV __m256i -#elif defined(__AVX256__) -#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) -#define SIMD_LOAD(x) _mm256_loadu_ps(x) -#define SIMD_SET(x) _mm256_set1_ps(x) -#define SIMD_ADD(x, y) _mm256_add_ps(x, y) -#define SIMD_MUL(x, y) _mm256_mul_ps(x, y) -#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) -#define SIMD_SQRT(x) _mm256_sqrt_ps(x) -#define SIMD_DIV(x, y) _mm256_div_ps(x, y) -#define SIMD_WIDTH 8 -#define SIMD_LOAD2(x, h) \ - ((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x)) - -#define SIMD_STORE2(x, d, h) \ - ((h) ? 
_mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ - : _mm256_storeu_ps(x, d)) - -#define INTV __m128i -#endif - -union AVX_Data { -#if defined(__AVX512__) - __m512 data; -#elif defined(__AVX256__) - __m256 data; -#endif - // float data_f[16]; -}; - -template -inline void simd_store(float* dst, AVX_Data* src, bool half_precision) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { - SIMD_STORE2(dst + SIMD_WIDTH * i, src[i].data, half_precision); - } -} -template -inline void simd_load(AVX_Data* dst, float* src, bool half_precision) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { - dst[i].data = SIMD_LOAD2(src + SIMD_WIDTH * i, half_precision); - } -} -template -inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { - dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a[i].data); - } -} -template -inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data src_a) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { - dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a.data); - } -} -template -inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data* src_m_r, AVX_Data* src_a) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { - dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r[i].data, src_a[i].data); - } -} -template -inline void simd_sqrt(AVX_Data* dst, AVX_Data* src) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_SQRT(src[i].data); } -} -template -inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r.data); } -} -template -inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r[i].data); } -} -template 
-inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r.data); } -} -template -inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r[i].data); } -} -template -inline void simd_div(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) -{ -#pragma unroll - for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_DIV(src_a_l[i].data, src_a_r[i].data); } -} - -#endif diff --git a/deepspeed/ops/csrc/includes/softmax.h b/deepspeed/ops/csrc/includes/softmax.h deleted file mode 100644 index 8d541a6fe0e691522a3add63bfd92e2620c914d3..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/softmax.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include -#include -#include -#include "custom_cuda_layers.h" - -#include - -using namespace std; - -template -class Softmax { -public: - struct Config { - size_t batchSize; - size_t heads; - size_t seq_length; - size_t prob_depth; - float temperature; - bool mem_alloc; - Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false) - : batchSize(batch), - heads(h), - seq_length(seq), - prob_depth(prob_size), - temperature(1.0), - mem_alloc(mem_alloc) - { - } - }; - - Softmax(Config config) : config_(config) {} - - ~Softmax() {} - - void Forward(int bsz, T* vals, const T* attn_mask, cudaStream_t& stream) - { - launch_attn_softmax(vals, attn_mask, bsz, config_.heads, config_.seq_length, stream); - } - - void Backward(int bsz, T* out_grad, const T* soft_out, cudaStream_t stream) - { - launch_attn_softmax_backward_v2( - out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream); - } - - inline size_t GetProbDepth() const { return config_.prob_depth; } - - inline size_t GetBatchSize() const { return config_.batchSize; } - - inline size_t 
GetNumHeads() const { return config_.heads; } - - inline size_t GetSeqLength() const { return config_.seq_length; } - - inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; } - -private: - Config config_; -}; diff --git a/deepspeed/ops/csrc/includes/softmax_hip.h b/deepspeed/ops/csrc/includes/softmax_hip.h deleted file mode 100644 index 47822e6e999ada42ec761a8c80571aad30edf57c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/softmax_hip.h +++ /dev/null @@ -1,61 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#pragma once - -#include -#include -#include -#include "custom_hip_layers.h" - -#include - -using namespace std; - -template -class Softmax { -public: - struct Config { - size_t batchSize; - size_t heads; - size_t seq_length; - size_t prob_depth; - float temperature; - bool mem_alloc; - Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false) - : batchSize(batch), - heads(h), - seq_length(seq), - prob_depth(prob_size), - temperature(1.0), - mem_alloc(mem_alloc) - { - } - }; - - Softmax(Config config) : config_(config) {} - - ~Softmax() {} - - void Forward(int bsz, T* vals, const T* attn_mask, hipStream_t& stream) - { - launch_attn_softmax(vals, attn_mask, bsz, config_.heads, config_.seq_length, stream); - } - - void Backward(int bsz, T* out_grad, const T* soft_out, hipStream_t stream) - { - launch_attn_softmax_backward_v2( - out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream); - } - - inline size_t GetProbDepth() const { return config_.prob_depth; } - - inline size_t GetBatchSize() const { return config_.batchSize; } - - inline size_t GetNumHeads() const { return config_.heads; } - - inline size_t GetSeqLength() const { return config_.seq_length; } - - inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; } - -private: - Config config_; -}; diff --git a/deepspeed/ops/csrc/includes/strided_batch_gemm.h 
b/deepspeed/ops/csrc/includes/strided_batch_gemm.h deleted file mode 100644 index 037319ba0dd963de0aca8327250eeffaf870bcd0..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/strided_batch_gemm.h +++ /dev/null @@ -1,195 +0,0 @@ -#pragma once - -#include -#include -#include -#include "context.h" - -template -class StridedBatchGemm { -public: - struct Config { - int batch_size; - int m; - int n; - int k; - float alpha; - float beta; - cublasOperation_t op_A; - cublasOperation_t op_B; - std::array gemm_algos; - - Config(int batch, - int mm, - int nn, - int kk, - float param_alpha, - float param_beta, - cublasOperation_t opA, - cublasOperation_t opB, - const std::array& algos) - : batch_size(batch), - m(mm), - n(nn), - k(kk), - alpha(param_alpha), - beta(param_beta), - op_A(opA), - op_B(opB), - gemm_algos(algos) - { - } - void SetConfig(int mm, int nn, int kk) - { - m = mm; - n = nn; - k = kk; - } - }; - - StridedBatchGemm(const Config& config) : _config(config) {} - - virtual ~StridedBatchGemm() {} - - void Forward(int bsz, T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle) - { - int stride_a = _config.m * _config.k; - int stride_b = _config.n * _config.k; - int stride_c = _config.m * _config.n; - - cublas_strided_batched_gemm(handle, - _config.m, - _config.n, - _config.k, - &_config.alpha, - &_config.beta, - _buffer_a, - _buffer_b, - output, - _config.op_A, - _config.op_B, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(_config.gemm_algos[0])); -#else - cublasGemmAlgo_t(_config.gemm_algos[0])); -#endif - } - - void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle) - { - int stride_a = _config.m * _config.k; - int stride_b = _config.n * _config.k; - int stride_c = _config.m * _config.n; - - cublas_strided_batched_gemm(handle, - _config.m, - _config.n, - _config.k, - &_config.alpha, - &_config.beta, - _buffer_a, - _buffer_b, - output, - 
_config.op_A, - _config.op_B, - stride_a, - stride_b, - stride_c, - _config.batch_size, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(_config.gemm_algos[0])); -#else - cublasGemmAlgo_t(_config.gemm_algos[0])); -#endif - - k_buf = _buffer_a; - q_buf = _buffer_b; - } - - void Backward(int bsz, - const T* d_output, - const T* _buffer_a, - const T* _buffer_b, - cublasHandle_t handle, - T* inpGradA = nullptr, - T* inpGradB = nullptr) - { - int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m); - int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k); - - int stride_a = mb * _config.n; - int stride_b = _config.n * kb; - int stride_c = _config.m * _config.k; - - // B need to transpose. - cublasOperation_t op_b = (_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T); - - // Calculate d_A. - cublas_strided_batched_gemm(handle, - mb, - kb, - _config.n, - &_config.alpha, - &_config.beta, - (_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output), - (_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b), - inpGradA, - CUBLAS_OP_N, - op_b, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(_config.gemm_algos[1])); -#else - cublasGemmAlgo_t(_config.gemm_algos[1])); -#endif - - // A need to transpose. - cublasOperation_t op_a = (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T); - - stride_a = _config.m * _config.k; - stride_b = _config.m * _config.n; - stride_c = _config.n * _config.k; - - // Calculate d_B. 
- cublas_strided_batched_gemm(handle, - _config.k, - _config.n, - _config.m, - &_config.alpha, - &_config.beta, - _buffer_a, - d_output, - inpGradB, - op_a, - CUBLAS_OP_N, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(_config.gemm_algos[2])); -#else - cublasGemmAlgo_t(_config.gemm_algos[2])); -#endif - } - - inline int GetN() const { return _config.k; } - - inline const T* GetBufferA() const { return k_buf; } - - inline const T* GetBufferB() const { return q_buf; } - - inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); } - -private: - Config _config; - const T* q_buf; - const T* k_buf; -}; diff --git a/deepspeed/ops/csrc/includes/strided_batch_gemm_hip.h b/deepspeed/ops/csrc/includes/strided_batch_gemm_hip.h deleted file mode 100644 index 9db208dc7230033e9681b7e4d2e0b651c9f458cd..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/strided_batch_gemm_hip.h +++ /dev/null @@ -1,196 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#pragma once - -#include -#include -#include -#include "context_hip.h" - -template -class StridedBatchGemm { -public: - struct Config { - int batch_size; - int m; - int n; - int k; - float alpha; - float beta; - rocblas_operation op_A; - rocblas_operation op_B; - std::array gemm_algos; - - Config(int batch, - int mm, - int nn, - int kk, - float param_alpha, - float param_beta, - rocblas_operation opA, - rocblas_operation opB, - const std::array& algos) - : batch_size(batch), - m(mm), - n(nn), - k(kk), - alpha(param_alpha), - beta(param_beta), - op_A(opA), - op_B(opB), - gemm_algos(algos) - { - } - void SetConfig(int mm, int nn, int kk) - { - m = mm; - n = nn; - k = kk; - } - }; - - StridedBatchGemm(const Config& config) : _config(config) {} - - virtual ~StridedBatchGemm() {} - - void Forward(int bsz, T* output, const T* _buffer_a, const T* _buffer_b, rocblas_handle handle) - { - int stride_a = _config.m * _config.k; - int stride_b = _config.n * _config.k; - int stride_c = _config.m * _config.n; - - cublas_strided_batched_gemm(handle, - _config.m, - _config.n, - _config.k, - &_config.alpha, - &_config.beta, - _buffer_a, - _buffer_b, - output, - _config.op_A, - _config.op_B, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(_config.gemm_algos[0])); -#else - cublasGemmAlgo_t(_config.gemm_algos[0])); -#endif - } - - void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, rocblas_handle handle) - { - int stride_a = _config.m * _config.k; - int stride_b = _config.n * _config.k; - int stride_c = _config.m * _config.n; - - cublas_strided_batched_gemm(handle, - _config.m, - _config.n, - _config.k, - &_config.alpha, - &_config.beta, - _buffer_a, - _buffer_b, - output, - _config.op_A, - _config.op_B, - stride_a, - stride_b, - stride_c, - _config.batch_size, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(_config.gemm_algos[0])); -#else - cublasGemmAlgo_t(_config.gemm_algos[0])); -#endif - - k_buf = _buffer_a; - q_buf 
= _buffer_b; - } - - void Backward(int bsz, - const T* d_output, - const T* _buffer_a, - const T* _buffer_b, - rocblas_handle handle, - T* inpGradA = nullptr, - T* inpGradB = nullptr) - { - int mb = (_config.op_A == rocblas_operation_transpose ? _config.k : _config.m); - int kb = (_config.op_A == rocblas_operation_transpose ? _config.m : _config.k); - - int stride_a = mb * _config.n; - int stride_b = _config.n * kb; - int stride_c = _config.m * _config.k; - - // B need to transpose. - rocblas_operation op_b = (_config.op_B == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose); - - // Calculate d_A. - cublas_strided_batched_gemm(handle, - mb, - kb, - _config.n, - &_config.alpha, - &_config.beta, - (_config.op_A == rocblas_operation_transpose ? _buffer_b : d_output), - (_config.op_A == rocblas_operation_transpose ? d_output : _buffer_b), - inpGradA, - rocblas_operation_none, - op_b, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(_config.gemm_algos[1])); -#else - cublasGemmAlgo_t(_config.gemm_algos[1])); -#endif - - // A need to transpose. - rocblas_operation op_a = (_config.op_A == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose); - - stride_a = _config.m * _config.k; - stride_b = _config.m * _config.n; - stride_c = _config.n * _config.k; - - // Calculate d_B. 
- cublas_strided_batched_gemm(handle, - _config.k, - _config.n, - _config.m, - &_config.alpha, - &_config.beta, - _buffer_a, - d_output, - inpGradB, - op_a, - rocblas_operation_none, - stride_a, - stride_b, - stride_c, - bsz, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo(_config.gemm_algos[2])); -#else - cublasGemmAlgo_t(_config.gemm_algos[2])); -#endif - } - - inline int GetN() const { return _config.k; } - - inline const T* GetBufferA() const { return k_buf; } - - inline const T* GetBufferB() const { return q_buf; } - - inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); } - -private: - Config _config; - const T* q_buf; - const T* k_buf; -}; diff --git a/deepspeed/ops/csrc/includes/type_shim.h b/deepspeed/ops/csrc/includes/type_shim.h deleted file mode 100644 index 4f4e7a539ac15d6931a6ac3dfd541c6bd2f6760d..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/type_shim.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */ -#include - -// Forward/backward compatibility hack around -// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288 -// pending more future-proof guidance from upstream. -// struct TypeShim -// { -// const at::Type& payload; -// TypeShim(const at::Type& type) : payload(type) {} -// // Enable trivial conversion to a const at::Type& for pre-3aeb78 -// operator const at::Type&(){ return payload; }; -// // Enable dispatch switch statements to take *this directly for post-3aeb78 -// //operator at::ScalarType(){ return payload.; }; -// }; - -#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) 
\ - switch (TYPE) { \ - case at::ScalarType::Float: { \ - using scalar_t_##LEVEL = float; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::Half: { \ - using scalar_t_##LEVEL = at::Half; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::BFloat16: { \ - using scalar_t_##LEVEL = at::BFloat16; \ - __VA_ARGS__; \ - break; \ - } \ - default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ - } - -#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \ - switch (TYPE) { \ - case at::ScalarType::Double: { \ - using scalar_t_##LEVEL = double; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::Float: { \ - using scalar_t_##LEVEL = float; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::Half: { \ - using scalar_t_##LEVEL = at::Half; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::BFloat16: { \ - using scalar_t_##LEVEL = at::BFloat16; \ - __VA_ARGS__; \ - break; \ - } \ - default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ - } - -#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \ - switch (TYPE) { \ - case at::ScalarType::Double: { \ - using scalar_t_##LEVEL = double; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::Float: { \ - using scalar_t_##LEVEL = float; \ - __VA_ARGS__; \ - break; \ - } \ - default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ - } - -template -__device__ __forceinline__ T -reduce_block_into_lanes(T* x, - T val, - int lanes = 1, - bool share_result = false) // lanes is intended to be <= 32. -{ - int tid = threadIdx.x + threadIdx.y * blockDim.x; - int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32. 
- - if (blockSize >= 64) { - x[tid] = val; - __syncthreads(); - } - -#pragma unroll - for (int i = (blockSize >> 1); i >= 64; i >>= 1) { - if (tid < i) x[tid] = x[tid] + x[tid + i]; - __syncthreads(); - } - - T final; - - if (tid < 32) { - if (blockSize >= 64) - final = x[tid] + x[tid + 32]; - else - final = val; - // __SYNCWARP(); - -#pragma unroll - for (int i = 16; i >= lanes; i >>= 1) - final = final + __shfl_down_sync(0xffffffff, final, i); - } - - if (share_result) { - if (tid < lanes) x[tid] = final; // EpilogueOp - // Make sure the smem result is visible to all warps. - __syncthreads(); - } - - return final; -} diff --git a/deepspeed/ops/csrc/includes/type_shim_hip.h b/deepspeed/ops/csrc/includes/type_shim_hip.h deleted file mode 100644 index 3bd86d52bee9f5cbe3ffed6da9f3af00f851899b..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/includes/type_shim_hip.h +++ /dev/null @@ -1,121 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */ -#include - -// Forward/backward compatibility hack around -// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288 -// pending more future-proof guidance from upstream. -// struct TypeShim -// { -// const at::Type& payload; -// TypeShim(const at::Type& type) : payload(type) {} -// // Enable trivial conversion to a const at::Type& for pre-3aeb78 -// operator const at::Type&(){ return payload; }; -// // Enable dispatch switch statements to take *this directly for post-3aeb78 -// //operator at::ScalarType(){ return payload.; }; -// }; - -#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) 
\ - switch (TYPE) { \ - case at::ScalarType::Float: { \ - using scalar_t_##LEVEL = float; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::Half: { \ - using scalar_t_##LEVEL = at::Half; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::BFloat16: { \ - using scalar_t_##LEVEL = at::BFloat16; \ - __VA_ARGS__; \ - break; \ - } \ - default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ - } - -#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \ - switch (TYPE) { \ - case at::ScalarType::Double: { \ - using scalar_t_##LEVEL = double; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::Float: { \ - using scalar_t_##LEVEL = float; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::Half: { \ - using scalar_t_##LEVEL = at::Half; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::BFloat16: { \ - using scalar_t_##LEVEL = at::BFloat16; \ - __VA_ARGS__; \ - break; \ - } \ - default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ - } - -#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \ - switch (TYPE) { \ - case at::ScalarType::Double: { \ - using scalar_t_##LEVEL = double; \ - __VA_ARGS__; \ - break; \ - } \ - case at::ScalarType::Float: { \ - using scalar_t_##LEVEL = float; \ - __VA_ARGS__; \ - break; \ - } \ - default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ - } - -template -__device__ __forceinline__ T -reduce_block_into_lanes(T* x, - T val, - int lanes = 1, - bool share_result = false) // lanes is intended to be <= 32. -{ - int tid = threadIdx.x + threadIdx.y * blockDim.x; - int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32. 
- - if (blockSize >= 64) { - x[tid] = val; - __syncthreads(); - } - -#pragma unroll - for (int i = (blockSize >> 1); i >= 64; i >>= 1) { - if (tid < i) x[tid] = x[tid] + x[tid + i]; - __syncthreads(); - } - - T final; - - if (tid < 32) { - if (blockSize >= 64) - final = x[tid] + x[tid + 32]; - else - final = val; - // __SYNCWARP(); - -#pragma unroll - for (int i = 16; i >= lanes; i >>= 1) - final = final + __shfl_down_sync(0xffffffff, final, i); - } - - if (share_result) { - if (tid < lanes) x[tid] = final; // EpilogueOp - // Make sure the smem result is visible to all warps. - __syncthreads(); - } - - return final; -} diff --git a/deepspeed/ops/csrc/lamb/fused_lamb_cuda.cpp b/deepspeed/ops/csrc/lamb/fused_lamb_cuda.cpp deleted file mode 100644 index 7a142b13b00ccafbc102b5217c9567ec42384af7..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/lamb/fused_lamb_cuda.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright 2019 The Microsoft DeepSpeed Team */ -#include - -// CUDA forward declaration -void fused_lamb_cuda(at::Tensor& p, - at::Tensor& p_copy, - at::Tensor& m, - at::Tensor& v, - at::Tensor& g, - float lr, - float beta1, - float beta2, - float max_coeff, - float min_coeff, - float eps, - float grad_scale, - int step, - int mode, - int bias_correction, - float decay, - at::Tensor& w_l2_i, - at::Tensor& u_l2_i, - at::Tensor& lamb_coeff_val); - -#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -// C++ interface -at::Tensor lamb(at::Tensor& p, - at::Tensor& p_copy, - at::Tensor& m, - at::Tensor& v, - at::Tensor& g, - float lr, - float beta1, - float beta2, - float max_coeff, - float min_coeff, - float eps, - float grad_scale, - int step, - int mode, - int bias_correction, - float decay) -{ - CHECK_INPUT(p); - if (p_copy.numel() > 0) CHECK_INPUT(p_copy); - 
CHECK_INPUT(m); - CHECK_INPUT(v); - CHECK_INPUT(g); - int64_t num_elem = p.numel(); - AT_ASSERTM(m.numel() == num_elem, "number of elements in m and p tensors should be equal"); - AT_ASSERTM(v.numel() == num_elem, "number of elements in v and p tensors should be equal"); - AT_ASSERTM(g.numel() == num_elem, "number of elements in g and p tensors should be equal"); - AT_ASSERTM( - p_copy.numel() == num_elem || p_copy.numel() == 0, - "number of elements in p_copy and p tensors should be equal, or p_copy should be empty"); - - // intermediate for weight L2 reduction - // make sure that the threads per block is at least 512 during the kernel launch otherwise the - // behaviour is unexpected - at::Tensor w_l2_i = at::empty( - {512}, - p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float - : p.type().scalarType())); - - // intermediate for update L2 reduction - // make sure that the threads per block is at least 512 during the kernel launch otherwise the - // behaviour is unexpected - at::Tensor u_l2_i = at::empty( - {512}, - p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float - : p.type().scalarType())); - - at::Tensor lamb_coeff_val = at::empty( - {1}, - p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? 
at::ScalarType::Float - : p.type().scalarType())); - - fused_lamb_cuda(p, - p_copy, - m, - v, - g, - lr, - beta1, - beta2, - max_coeff, - min_coeff, - eps, - grad_scale, - step, - mode, - bias_correction, - decay, - w_l2_i, - u_l2_i, - lamb_coeff_val); - - return lamb_coeff_val; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("lamb", &lamb, "Adam optimized CUDA implementation with LAMB."); -} diff --git a/deepspeed/ops/csrc/lamb/fused_lamb_cuda_kernel.cu b/deepspeed/ops/csrc/lamb/fused_lamb_cuda_kernel.cu deleted file mode 100644 index c76632362cb3e12abe5ac95a6b8889ea25a61697..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/lamb/fused_lamb_cuda_kernel.cu +++ /dev/null @@ -1,474 +0,0 @@ -/* Copyright 2019 The Microsoft DeepSpeed Team */ -#include -#include -#include -#include -#include "ATen/ATen.h" -#include "ATen/TensorUtils.h" -#include "ATen/cuda/CUDAContext.h" -#include "ATen/cuda/detail/IndexUtils.cuh" -//#include "ATen/Type.h" -#include "ATen/AccumulateType.h" - -#include - -//#include -#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305 -#include -#else -#include -#endif -#include -#include - -namespace cg = cooperative_groups; - -// Utility class used to avoid linker errors with extern -// unsized shared memory arrays with templated type -namespace { -// This is the un-specialized struct. Note that we prevent instantiation of this -// struct by putting an undefined symbol in the function body so it won't compile. 
-template -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - __device__ inline operator T*() - { -#ifndef _WIN32 - extern __device__ void error(void); - error(); -#endif - return NULL; - } -}; - -template <> -struct SharedMemory { - __device__ inline operator float*() - { - extern __shared__ float s_float[]; - return s_float; - } -}; - -template <> -struct SharedMemory { - __device__ inline operator double*() - { - extern __shared__ double s_double[]; - return s_double; - } -}; -} // namespace - -#include "type_shim.h" - -typedef enum { - ADAM_MODE_0 = 0, // eps under square root - ADAM_MODE_1 = 1 // eps outside square root -} adamMode_t; - -// s_a and s_b are in shared memory -// g_a and g_b are in shared memory -template -__device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - - // perform block reduction in shared memory, - unsigned int tid = cta.thread_rank(); - - T a_sum = s_a[tid]; - T b_sum = s_b[tid]; - - cg::sync(cta); - - // do reduction in shared mem - if ((blockSize >= 512) && (tid < 256)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 256]; - s_b[tid] = b_sum = b_sum + s_b[tid + 256]; - } - - cg::sync(cta); - - if ((blockSize >= 256) && (tid < 128)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 128]; - s_b[tid] = b_sum = b_sum + s_b[tid + 128]; - } - - cg::sync(cta); - - if ((blockSize >= 128) && (tid < 64)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 64]; - s_b[tid] = b_sum = b_sum + s_b[tid + 64]; - } - - cg::sync(cta); - -#if (__CUDA_ARCH__ >= 300) - if (tid < 32) { - cg::coalesced_group active = cg::coalesced_threads(); - - // Fetch final intermediate sum from 2nd warp - if (blockSize >= 64) { - a_sum = a_sum + s_a[tid + 32]; - b_sum = b_sum + s_b[tid + 32]; - } - - // Reduce final warp using shuffle - for (int offset = warpSize / 2; offset > 0; offset /= 2) { - a_sum += active.shfl_down(a_sum, offset); - b_sum += 
active.shfl_down(b_sum, offset); - } - } -#else - if ((blockSize >= 64) && (tid < 32)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 32]; - s_b[tid] = b_sum = b_sum + s_b[tid + 32]; - } - - cg::sync(cta); - - if ((blockSize >= 32) && (tid < 16)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 16]; - s_b[tid] = b_sum = b_sum + s_b[tid + 16]; - } - - cg::sync(cta); - - if ((blockSize >= 16) && (tid < 8)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 8]; - s_b[tid] = b_sum = b_sum + s_b[tid + 8]; - } - - cg::sync(cta); - - if ((blockSize >= 8) && (tid < 4)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 4]; - s_b[tid] = b_sum = b_sum + s_b[tid + 4]; - } - - cg::sync(cta); - - if ((blockSize >= 4) && (tid < 2)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 2]; - s_b[tid] = b_sum = b_sum + s_b[tid + 2]; - } - - cg::sync(cta); - - if ((blockSize >= 2) && (tid < 1)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 1]; - s_b[tid] = b_sum = b_sum + s_b[tid + 1]; - } - - cg::sync(cta); - -#endif - - // write result for this block to global mem - if (tid == 0) { - g_a[blockIdx.x] = (T)a_sum; - g_b[blockIdx.x] = (T)b_sum; - } -} - -template -__device__ void reduce_two_vectors_in_register(T a, T b, T* g_a, T* g_b) -{ - const int threadIdInBlock = cg::this_thread_block().thread_rank(); - - T* s_a = SharedMemory(); - T* s_b = SharedMemory() + cg::this_thread_block().size(); - - s_a[threadIdInBlock] = a; - s_b[threadIdInBlock] = b; - - reduce_block_in_shared_memory(s_a, s_b, g_a, g_b); -} - -template -__global__ void lamb_cuda_kernel_part1( - T* __restrict__ p, - GRAD_T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed - T* __restrict__ m, - T* __restrict__ v, - const GRAD_T* __restrict__ g, - const float b1, - const float b2, - const float eps, - const float grad_scale, - const float step_size, - const size_t tsize, - adamMode_t mode, - const float decay, - T* __restrict__ w_l2_i, - T* __restrict__ u_l2_i) -{ - // Assuming 2D grids and 2D blocks - const int blockId = gridDim.x * 
blockIdx.y + blockIdx.x; - const int threadsPerBlock = blockDim.x * blockDim.y; - const int threadIdInBlock = cg::this_thread_block().thread_rank(); - const int i = (blockId * threadsPerBlock + threadIdInBlock); - const int totThreads = gridDim.x * gridDim.y * threadsPerBlock; - - T reg_w = 0; - T reg_u = 0; - - for (int j = i; j < tsize; j += totThreads) { - T scaled_grad = g[j] / grad_scale; - T pj = p[j]; - m[j] = b1 * m[j] + (1 - b1) * scaled_grad; - v[j] = b2 * v[j] + (1 - b2) * scaled_grad * scaled_grad; - float denom; - if (mode == ADAM_MODE_0) - denom = sqrtf(v[j] + eps); - else // Mode 1 - denom = sqrtf(v[j]) + eps; - T update = (m[j] / denom) + (decay * p[j]); - - reg_u += update * update; - reg_w += pj * pj; - } - - reduce_two_vectors_in_register(reg_w, reg_u, w_l2_i, u_l2_i); -} - -template -__global__ void lamb_cuda_kernel_part2(const size_t tsize, T* __restrict__ g_a, T* __restrict__ g_b) -{ - T* s_a = SharedMemory(); - T* s_b = SharedMemory() + cg::this_thread_block().size(); - - const int threadIdInBlock = cg::this_thread_block().thread_rank(); - - s_a[threadIdInBlock] = g_a[threadIdInBlock]; - s_b[threadIdInBlock] = g_b[threadIdInBlock]; - - if (threadIdInBlock >= tsize) { - s_a[threadIdInBlock] = 0.0; - s_b[threadIdInBlock] = 0.0; - } - - reduce_block_in_shared_memory(s_a, s_b, g_a, g_b); -} - -template -__global__ void lamb_cuda_kernel_part3( - T* __restrict__ p, - GRAD_T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed - T* __restrict__ m, - T* __restrict__ v, - const GRAD_T* __restrict__ g, - const float b1, - const float b2, - const float max_coeff, - const float min_coeff, - const float eps, - const float grad_scale, - const float step_size, - const size_t tsize, - adamMode_t mode, - const float decay, - T* __restrict__ w_l2_i, - T* __restrict__ u_l2_i, - T* __restrict__ lamb_coeff_val) -{ - // Assuming 2D grids and 2D blocks - const int blockId = gridDim.x * blockIdx.y + blockIdx.x; - const int threadsPerBlock = 
blockDim.x * blockDim.y; - const int threadIdInBlock = cg::this_thread_block().thread_rank(); - const int i = (blockId * threadsPerBlock + threadIdInBlock); - const int totThreads = gridDim.x * gridDim.y * threadsPerBlock; - - T reg_w = sqrtf(w_l2_i[0]); - T reg_u = sqrtf(u_l2_i[0]); - - float lamb_coeff = 1.0; - - if (reg_w != 0 && reg_u != 0) { - lamb_coeff = reg_w / reg_u; - if (lamb_coeff > max_coeff) { lamb_coeff = max_coeff; } - if (lamb_coeff < min_coeff) { lamb_coeff = min_coeff; } - } - - if (blockId == 0 && threadIdInBlock == 0) { - lamb_coeff_val[0] = lamb_coeff; - // printf("Cuda Lamb Coeff is %.6f \n",lamb_coeff); - } - - for (int j = i; j < tsize; j += totThreads) { - T pj = (float)p[j]; - T mj = m[j]; - T vj = v[j]; - float denom; - if (mode == ADAM_MODE_0) - denom = sqrtf(vj + eps); - else // Mode 1 - denom = sqrtf(vj) + eps; - T update = (mj / denom) + (decay * pj); - - pj = pj - (step_size * lamb_coeff * update); - p[j] = pj; - if (p_copy != NULL) p_copy[j] = (GRAD_T)pj; - } -} - -void fused_lamb_cuda(at::Tensor& p, - at::Tensor& p_copy, - at::Tensor& m, - at::Tensor& v, - at::Tensor& g, - float lr, - float beta1, - float beta2, - float max_coeff, - float min_coeff, - float eps, - float grad_scale, - int step, - int mode, - int bias_correction, - float decay, - at::Tensor& w_l2_i, - at::Tensor& u_l2_i, - at::Tensor& lamb_coeff) -{ - // using namespace at; - - // Get tensor size - int tsize = p.numel(); - // Determine #threads and #blocks - const int threadsPerBlock = 512; - int num_blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock; - if (num_blocks > 512) num_blocks = 512; - - int smemsize = 0; - - if (p.type().scalarType() == at::ScalarType::Double) - smemsize = 2 * threadsPerBlock * sizeof(double); - else - smemsize = 2 * threadsPerBlock * sizeof(float); - - const dim3 blocks(num_blocks); - const dim3 threads(threadsPerBlock); - - AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), - "parameter tensor is too large to be indexed with 
int32"); - // Constants - float step_size = 0; - if (bias_correction == 1) { - const float bias_correction1 = 1 - std::pow(beta1, step); - const float bias_correction2 = 1 - std::pow(beta2, step); - step_size = lr * std::sqrt(bias_correction2) / bias_correction1; - } else { - step_size = lr; - } - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - if (g.type().scalarType() == at::ScalarType::Half) { - // all other values should be fp32 for half gradients - AT_ASSERTM(p.type().scalarType() == at::ScalarType::Float, - "expected parameter to be of float type"); - // dispatch is done on the gradient type - using namespace at; // prevents "toString is undefined" errors - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - g.scalar_type(), "lamb_cuda_kernel", ([&] { - using accscalar_t = at::acc_type; - - lamb_cuda_kernel_part1 - <<>>( - p.data(), - p_copy.numel() ? p_copy.data() : NULL, - m.data(), - v.data(), - g.data(), - beta1, - beta2, - eps, - grad_scale, - step_size, - tsize, - (adamMode_t)mode, - decay, - w_l2_i.data(), - u_l2_i.data()); - - lamb_cuda_kernel_part2 - <<<1, threadsPerBlock, smemsize, stream>>>( - num_blocks, w_l2_i.data(), u_l2_i.data()); - - lamb_cuda_kernel_part3 - <<>>( - p.data(), - p_copy.numel() ? 
p_copy.data() : NULL, - m.data(), - v.data(), - g.data(), - beta1, - beta2, - max_coeff, - min_coeff, - eps, - grad_scale, - step_size, - tsize, - (adamMode_t)mode, - decay, - w_l2_i.data(), - u_l2_i.data(), - lamb_coeff.data()); - })); - } else { - using namespace at; - AT_DISPATCH_FLOATING_TYPES( - g.scalar_type(), "lamb_cuda_kernel", ([&] { - lamb_cuda_kernel_part1 - <<>>( - p.data(), - NULL, // don't output p_copy for fp32, it's wasted write - m.data(), - v.data(), - g.data(), - beta1, - beta2, - eps, - grad_scale, - step_size, - tsize, - (adamMode_t)mode, - decay, - w_l2_i.data(), - u_l2_i.data()); - - lamb_cuda_kernel_part2 - <<<1, threadsPerBlock, smemsize, stream>>>( - num_blocks, w_l2_i.data(), u_l2_i.data()); - - lamb_cuda_kernel_part3 - <<>>( - p.data(), - NULL, // don't output p_copy for fp32, it's wasted write - m.data(), - v.data(), - g.data(), - beta1, - beta2, - max_coeff, - min_coeff, - eps, - grad_scale, - step_size, - tsize, - (adamMode_t)mode, - decay, - w_l2_i.data(), - u_l2_i.data(), - lamb_coeff.data()); - })); - } - C10_CUDA_CHECK(cudaGetLastError()); -} - -// template __device__ void reduce_two_vectors_in_register(float a, float b, float* g_a, -// float* g_b, cg::grid_group &cgg); diff --git a/deepspeed/ops/csrc/lamb/fused_lamb_hip_kernel.hip b/deepspeed/ops/csrc/lamb/fused_lamb_hip_kernel.hip deleted file mode 100644 index 2e2bc69f6156c9a21cc2a481dc423ded17351651..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/lamb/fused_lamb_hip_kernel.hip +++ /dev/null @@ -1,475 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-/* Copyright 2019 The Microsoft DeepSpeed Team */ -#include -#include -#include -#include -#include "ATen/ATen.h" -#include "ATen/TensorUtils.h" -#include "ATen/hip/HIPContext.h" -#include "ATen/hip/detail/IndexUtils.cuh" -//#include "ATen/Type.h" -#include "ATen/AccumulateType.h" - -#include - -//#include -#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305 -#include -#else -#include -#endif -#include -#include - -namespace cg = cooperative_groups; - -// Utility class used to avoid linker errors with extern -// unsized shared memory arrays with templated type -namespace { -// This is the un-specialized struct. Note that we prevent instantiation of this -// struct by putting an undefined symbol in the function body so it won't compile. -template -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - __device__ inline operator T*() - { -#ifndef _WIN32 - extern __device__ void error(void); - error(); -#endif - return NULL; - } -}; - -template <> -struct SharedMemory { - __device__ inline operator float*() - { - HIP_DYNAMIC_SHARED( float, s_float) - return s_float; - } -}; - -template <> -struct SharedMemory { - __device__ inline operator double*() - { - HIP_DYNAMIC_SHARED( double, s_double) - return s_double; - } -}; -} // namespace - -#include "type_shim_hip.h" - -typedef enum { - ADAM_MODE_0 = 0, // eps under square root - ADAM_MODE_1 = 1 // eps outside square root -} adamMode_t; - -// s_a and s_b are in shared memory -// g_a and g_b are in shared memory -template -__device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - - // perform block reduction in shared memory, - unsigned int tid = cta.thread_rank(); - - T a_sum = s_a[tid]; - T b_sum = s_b[tid]; - - cg::sync(cta); - - // do reduction in shared mem - if ((blockSize >= 512) && (tid < 256)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 256]; - s_b[tid] = b_sum = b_sum + 
s_b[tid + 256]; - } - - cg::sync(cta); - - if ((blockSize >= 256) && (tid < 128)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 128]; - s_b[tid] = b_sum = b_sum + s_b[tid + 128]; - } - - cg::sync(cta); - - if ((blockSize >= 128) && (tid < 64)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 64]; - s_b[tid] = b_sum = b_sum + s_b[tid + 64]; - } - - cg::sync(cta); - -#if (__CUDA_ARCH__ >= 300) - if (tid < 32) { - cg::coalesced_group active = cg::coalesced_threads(); - - // Fetch final intermediate sum from 2nd warp - if (blockSize >= 64) { - a_sum = a_sum + s_a[tid + 32]; - b_sum = b_sum + s_b[tid + 32]; - } - - // Reduce final warp using shuffle - for (int offset = warpSize / 2; offset > 0; offset /= 2) { - a_sum += active.shfl_down(a_sum, offset); - b_sum += active.shfl_down(b_sum, offset); - } - } -#else - if ((blockSize >= 64) && (tid < 32)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 32]; - s_b[tid] = b_sum = b_sum + s_b[tid + 32]; - } - - cg::sync(cta); - - if ((blockSize >= 32) && (tid < 16)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 16]; - s_b[tid] = b_sum = b_sum + s_b[tid + 16]; - } - - cg::sync(cta); - - if ((blockSize >= 16) && (tid < 8)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 8]; - s_b[tid] = b_sum = b_sum + s_b[tid + 8]; - } - - cg::sync(cta); - - if ((blockSize >= 8) && (tid < 4)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 4]; - s_b[tid] = b_sum = b_sum + s_b[tid + 4]; - } - - cg::sync(cta); - - if ((blockSize >= 4) && (tid < 2)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 2]; - s_b[tid] = b_sum = b_sum + s_b[tid + 2]; - } - - cg::sync(cta); - - if ((blockSize >= 2) && (tid < 1)) { - s_a[tid] = a_sum = a_sum + s_a[tid + 1]; - s_b[tid] = b_sum = b_sum + s_b[tid + 1]; - } - - cg::sync(cta); - -#endif - - // write result for this block to global mem - if (tid == 0) { - g_a[blockIdx.x] = (T)a_sum; - g_b[blockIdx.x] = (T)b_sum; - } -} - -template -__device__ void reduce_two_vectors_in_register(T a, T b, T* g_a, T* g_b) -{ - const int threadIdInBlock = 
cg::this_thread_block().thread_rank(); - - T* s_a = SharedMemory(); - T* s_b = SharedMemory() + cg::this_thread_block().size(); - - s_a[threadIdInBlock] = a; - s_b[threadIdInBlock] = b; - - reduce_block_in_shared_memory(s_a, s_b, g_a, g_b); -} - -template -__global__ void lamb_cuda_kernel_part1( - T* __restrict__ p, - GRAD_T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed - T* __restrict__ m, - T* __restrict__ v, - const GRAD_T* __restrict__ g, - const float b1, - const float b2, - const float eps, - const float grad_scale, - const float step_size, - const size_t tsize, - adamMode_t mode, - const float decay, - T* __restrict__ w_l2_i, - T* __restrict__ u_l2_i) -{ - // Assuming 2D grids and 2D blocks - const int blockId = gridDim.x * blockIdx.y + blockIdx.x; - const int threadsPerBlock = blockDim.x * blockDim.y; - const int threadIdInBlock = cg::this_thread_block().thread_rank(); - const int i = (blockId * threadsPerBlock + threadIdInBlock); - const int totThreads = gridDim.x * gridDim.y * threadsPerBlock; - - T reg_w = 0; - T reg_u = 0; - - for (int j = i; j < tsize; j += totThreads) { - T scaled_grad = g[j] / grad_scale; - T pj = p[j]; - m[j] = b1 * m[j] + (1 - b1) * scaled_grad; - v[j] = b2 * v[j] + (1 - b2) * scaled_grad * scaled_grad; - float denom; - if (mode == ADAM_MODE_0) - denom = sqrtf(v[j] + eps); - else // Mode 1 - denom = sqrtf(v[j]) + eps; - T update = (m[j] / denom) + (decay * p[j]); - - reg_u += update * update; - reg_w += pj * pj; - } - - reduce_two_vectors_in_register(reg_w, reg_u, w_l2_i, u_l2_i); -} - -template -__global__ void lamb_cuda_kernel_part2(const size_t tsize, T* __restrict__ g_a, T* __restrict__ g_b) -{ - T* s_a = SharedMemory(); - T* s_b = SharedMemory() + cg::this_thread_block().size(); - - const int threadIdInBlock = cg::this_thread_block().thread_rank(); - - s_a[threadIdInBlock] = g_a[threadIdInBlock]; - s_b[threadIdInBlock] = g_b[threadIdInBlock]; - - if (threadIdInBlock >= tsize) { - 
s_a[threadIdInBlock] = 0.0; - s_b[threadIdInBlock] = 0.0; - } - - reduce_block_in_shared_memory(s_a, s_b, g_a, g_b); -} - -template -__global__ void lamb_cuda_kernel_part3( - T* __restrict__ p, - GRAD_T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed - T* __restrict__ m, - T* __restrict__ v, - const GRAD_T* __restrict__ g, - const float b1, - const float b2, - const float max_coeff, - const float min_coeff, - const float eps, - const float grad_scale, - const float step_size, - const size_t tsize, - adamMode_t mode, - const float decay, - T* __restrict__ w_l2_i, - T* __restrict__ u_l2_i, - T* __restrict__ lamb_coeff_val) -{ - // Assuming 2D grids and 2D blocks - const int blockId = gridDim.x * blockIdx.y + blockIdx.x; - const int threadsPerBlock = blockDim.x * blockDim.y; - const int threadIdInBlock = cg::this_thread_block().thread_rank(); - const int i = (blockId * threadsPerBlock + threadIdInBlock); - const int totThreads = gridDim.x * gridDim.y * threadsPerBlock; - - T reg_w = sqrtf(w_l2_i[0]); - T reg_u = sqrtf(u_l2_i[0]); - - float lamb_coeff = 1.0; - - if (reg_w != 0 && reg_u != 0) { - lamb_coeff = reg_w / reg_u; - if (lamb_coeff > max_coeff) { lamb_coeff = max_coeff; } - if (lamb_coeff < min_coeff) { lamb_coeff = min_coeff; } - } - - if (blockId == 0 && threadIdInBlock == 0) { - lamb_coeff_val[0] = lamb_coeff; - // printf("Cuda Lamb Coeff is %.6f \n",lamb_coeff); - } - - for (int j = i; j < tsize; j += totThreads) { - T pj = (float)p[j]; - T mj = m[j]; - T vj = v[j]; - float denom; - if (mode == ADAM_MODE_0) - denom = sqrtf(vj + eps); - else // Mode 1 - denom = sqrtf(vj) + eps; - T update = (mj / denom) + (decay * pj); - - pj = pj - (step_size * lamb_coeff * update); - p[j] = pj; - if (p_copy != NULL) p_copy[j] = (GRAD_T)pj; - } -} - -void fused_lamb_cuda(at::Tensor& p, - at::Tensor& p_copy, - at::Tensor& m, - at::Tensor& v, - at::Tensor& g, - float lr, - float beta1, - float beta2, - float max_coeff, - float min_coeff, - float 
eps, - float grad_scale, - int step, - int mode, - int bias_correction, - float decay, - at::Tensor& w_l2_i, - at::Tensor& u_l2_i, - at::Tensor& lamb_coeff) -{ - // using namespace at; - - // Get tensor size - int tsize = p.numel(); - // Determine #threads and #blocks - const int threadsPerBlock = 512; - int num_blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock; - if (num_blocks > 512) num_blocks = 512; - - int smemsize = 0; - - if (p.type().scalarType() == at::ScalarType::Double) - smemsize = 2 * threadsPerBlock * sizeof(double); - else - smemsize = 2 * threadsPerBlock * sizeof(float); - - const dim3 blocks(num_blocks); - const dim3 threads(threadsPerBlock); - - AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), - "parameter tensor is too large to be indexed with int32"); - // Constants - float step_size = 0; - if (bias_correction == 1) { - const float bias_correction1 = 1 - ::pow(beta1, step); - const float bias_correction2 = 1 - ::pow(beta2, step); - step_size = lr * std::sqrt(bias_correction2) / bias_correction1; - } else { - step_size = lr; - } - hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA(); - - if (g.type().scalarType() == at::ScalarType::Half) { - // all other values should be fp32 for half gradients - AT_ASSERTM(p.type().scalarType() == at::ScalarType::Float, - "expected parameter to be of float type"); - // dispatch is done on the gradient type - using namespace at; // prevents "toString is undefined" errors - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - g.scalar_type(), "lamb_cuda_kernel", ([&] { - using accscalar_t = at::acc_type; - - hipLaunchKernelGGL(( lamb_cuda_kernel_part1) - , dim3(blocks), dim3(threadsPerBlock), smemsize, stream, - p.data(), - p_copy.numel() ? 
p_copy.data() : NULL, - m.data(), - v.data(), - g.data(), - beta1, - beta2, - eps, - grad_scale, - step_size, - tsize, - (adamMode_t)mode, - decay, - w_l2_i.data(), - u_l2_i.data()); - - hipLaunchKernelGGL(( lamb_cuda_kernel_part2) - , dim3(1), dim3(threadsPerBlock), smemsize, stream, - num_blocks, w_l2_i.data(), u_l2_i.data()); - - hipLaunchKernelGGL(( lamb_cuda_kernel_part3) - , dim3(blocks), dim3(threadsPerBlock), smemsize, stream, - p.data(), - p_copy.numel() ? p_copy.data() : NULL, - m.data(), - v.data(), - g.data(), - beta1, - beta2, - max_coeff, - min_coeff, - eps, - grad_scale, - step_size, - tsize, - (adamMode_t)mode, - decay, - w_l2_i.data(), - u_l2_i.data(), - lamb_coeff.data()); - })); - } else { - using namespace at; - AT_DISPATCH_FLOATING_TYPES( - g.scalar_type(), "lamb_cuda_kernel", ([&] { - hipLaunchKernelGGL(( lamb_cuda_kernel_part1) - , dim3(blocks), dim3(threadsPerBlock), smemsize, stream, - p.data(), - NULL, // don't output p_copy for fp32, it's wasted write - m.data(), - v.data(), - g.data(), - beta1, - beta2, - eps, - grad_scale, - step_size, - tsize, - (adamMode_t)mode, - decay, - w_l2_i.data(), - u_l2_i.data()); - - hipLaunchKernelGGL(( lamb_cuda_kernel_part2) - , dim3(1), dim3(threadsPerBlock), smemsize, stream, - num_blocks, w_l2_i.data(), u_l2_i.data()); - - hipLaunchKernelGGL(( lamb_cuda_kernel_part3) - , dim3(blocks), dim3(threadsPerBlock), smemsize, stream, - p.data(), - NULL, // don't output p_copy for fp32, it's wasted write - m.data(), - v.data(), - g.data(), - beta1, - beta2, - max_coeff, - min_coeff, - eps, - grad_scale, - step_size, - tsize, - (adamMode_t)mode, - decay, - w_l2_i.data(), - u_l2_i.data(), - lamb_coeff.data()); - })); - } - C10_HIP_CHECK(hipGetLastError()); -} - -// template __device__ void reduce_two_vectors_in_register(float a, float b, float* g_a, -// float* g_b, cg::grid_group &cgg); diff --git a/deepspeed/ops/csrc/quantization/pt_binding.cpp b/deepspeed/ops/csrc/quantization/pt_binding.cpp deleted file mode 
100644 index f76c4368a20090d1821776fed4877fa021db57c5..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/quantization/pt_binding.cpp +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include "custom_cuda_layers.h" - -template -at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits) -{ - auto t_size = vals.sizes(); - int size = 1; - for (auto dim : t_size) size *= dim; - - if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) { - launch_quantize_kernel( - (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); - } - return vals; -} - -template -at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits) -{ - auto t_size = vals.sizes(); - int size = 1; - for (auto dim : t_size) size *= dim; - - if (((size / groups) / 4 / 1024) <= 256) { - launch_sr_quantize_kernel( - (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); - } - return vals; -} - -template -at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits) -{ - auto t_size = vals.sizes(); - int size = 1; - for (auto dim : t_size) size *= dim; - - if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) { - launch_quantize_kernel_asym( - (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); - } - return vals; -} - -template -at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits) -{ - auto t_size = vals.sizes(); - int size = 1; - for (auto dim : t_size) size *= dim; - - if (((size / groups) / 4 / 1024) <= 256) { - launch_sr_quantize_kernel_asym( - (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); - } - return vals; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("ds_quantize_fp32", &ds_quantize, "DeepSpeed Quantize with fp32 (CUDA)"); - m.def("ds_quantize_fp16", &ds_quantize<__half>, "DeepSpeed Quantize with fp16 (CUDA)"); - m.def("ds_sr_quantize_fp32", &ds_sr_quantize, "DeepSpeed Quantize with fp32 (CUDA)"); - m.def("ds_sr_quantize_fp16", 
&ds_sr_quantize<__half>, "DeepSpeed Quantize with fp16 (CUDA)"); - m.def("ds_quantize_asym_fp32", &ds_quantize_asym, "DeepSpeed Quantize with fp32 (CUDA)"); - m.def( - "ds_quantize_asym_fp16", &ds_quantize_asym<__half>, "DeepSpeed Quantize with fp16 (CUDA)"); - m.def("ds_sr_quantize_asym_fp32", - &ds_sr_quantize_asym, - "DeepSpeed Quantize with fp32 (CUDA)"); - m.def("ds_sr_quantize_asym_fp16", - &ds_sr_quantize_asym<__half>, - "DeepSpeed Quantize with fp16 (CUDA)"); -} diff --git a/deepspeed/ops/csrc/quantization/pt_binding_hip.cpp b/deepspeed/ops/csrc/quantization/pt_binding_hip.cpp deleted file mode 100644 index 25ddba1a36a985e78be34b6b7a9c0c5c7df2fea5..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/quantization/pt_binding_hip.cpp +++ /dev/null @@ -1,78 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include -#include -#include -#include "custom_hip_layers.h" - -template -at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits) -{ - auto t_size = vals.sizes(); - int size = 1; - for (auto dim : t_size) size *= dim; - - if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) { - launch_quantize_kernel( - (T*)vals.data_ptr(), size, groups, bits, at::hip::getCurrentHIPStreamMasqueradingAsCUDA()); - } - return vals; -} - -template -at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits) -{ - auto t_size = vals.sizes(); - int size = 1; - for (auto dim : t_size) size *= dim; - - if (((size / groups) / 4 / 1024) <= 256) { - launch_sr_quantize_kernel( - (T*)vals.data_ptr(), size, groups, bits, at::hip::getCurrentHIPStreamMasqueradingAsCUDA()); - } - return vals; -} - -template -at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits) -{ - auto t_size = vals.sizes(); - int size = 1; - for (auto dim : t_size) size *= dim; - - if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) { - launch_quantize_kernel_asym( - (T*)vals.data_ptr(), size, groups, bits, at::hip::getCurrentHIPStreamMasqueradingAsCUDA()); - } - 
return vals; -} - -template -at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits) -{ - auto t_size = vals.sizes(); - int size = 1; - for (auto dim : t_size) size *= dim; - - if (((size / groups) / 4 / 1024) <= 256) { - launch_sr_quantize_kernel_asym( - (T*)vals.data_ptr(), size, groups, bits, at::hip::getCurrentHIPStreamMasqueradingAsCUDA()); - } - return vals; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("ds_quantize_fp32", &ds_quantize, "DeepSpeed Quantize with fp32 (CUDA)"); - m.def("ds_quantize_fp16", &ds_quantize<__half>, "DeepSpeed Quantize with fp16 (CUDA)"); - m.def("ds_sr_quantize_fp32", &ds_sr_quantize, "DeepSpeed Quantize with fp32 (CUDA)"); - m.def("ds_sr_quantize_fp16", &ds_sr_quantize<__half>, "DeepSpeed Quantize with fp16 (CUDA)"); - m.def("ds_quantize_asym_fp32", &ds_quantize_asym, "DeepSpeed Quantize with fp32 (CUDA)"); - m.def( - "ds_quantize_asym_fp16", &ds_quantize_asym<__half>, "DeepSpeed Quantize with fp16 (CUDA)"); - m.def("ds_sr_quantize_asym_fp32", - &ds_sr_quantize_asym, - "DeepSpeed Quantize with fp32 (CUDA)"); - m.def("ds_sr_quantize_asym_fp16", - &ds_sr_quantize_asym<__half>, - "DeepSpeed Quantize with fp16 (CUDA)"); -} diff --git a/deepspeed/ops/csrc/quantization/quantizer.hip b/deepspeed/ops/csrc/quantization/quantizer.hip deleted file mode 100644 index 9134593275130a29dc43384d99e15bd2722f3e4c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/quantization/quantizer.hip +++ /dev/null @@ -1,1039 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include -#include "custom_hip_layers.h" - -namespace cg = cooperative_groups; - -__global__ void quantize_kernel(__half* vals, int group_size, int num_bits) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - float2* vals_cast = reinterpret_cast(vals); - - float2 data[MAX_REG]; - - int group_id = blockIdx.x; - - { - int group_index = id; - int reg_count = 0; - int offset = group_id * group_size; - float max = -10000.0; - - while (group_index < group_size && reg_count < MAX_REG) { - data[reg_count] = vals_cast[offset + group_index]; - __half* data_h = reinterpret_cast<__half*>(&data[reg_count]); - - if (abs((float)data_h[0]) > max) max = abs((float)data_h[0]); - if (abs((float)data_h[1]) > max) max = abs((float)data_h[1]); - if (abs((float)data_h[2]) > max) max = abs((float)data_h[2]); - if (abs((float)data_h[3]) > max) max = abs((float)data_h[3]); - - group_index += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - __shared__ float partialMax[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } - - max = g.shfl(max, 0); - - float q_scale = (1 << num_bits) / (2 * max + 1e-5); - float q_scale_inv = 1 / q_scale; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - __half2* data_h = reinterpret_cast<__half2*>(&data[i]); - float2 q_data[2]; - q_data[0] = __half22float2(data_h[0]); - q_data[1] = __half22float2(data_h[1]); - - float2 
q_data_int[2]; - - q_data_int[0].x = roundf(q_data[0].x * q_scale); - q_data_int[0].y = roundf(q_data[0].y * q_scale); - q_data_int[1].x = roundf(q_data[1].x * q_scale); - q_data_int[1].y = roundf(q_data[1].y * q_scale); - - q_data_int[0].x *= q_scale_inv; - q_data_int[0].y *= q_scale_inv; - q_data_int[1].x *= q_scale_inv; - q_data_int[1].y *= q_scale_inv; - - data_h[0] = __float22half2_rn(q_data_int[0]); - data_h[1] = __float22half2_rn(q_data_int[1]); - - vals_cast[offset + group_index] = data[i]; - } - } - } -#endif -} - -__global__ void quantize_kernel(float* vals, int group_size, int num_bits) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - float4* vals_cast = reinterpret_cast(vals); - - float4 data[MAX_REG]; - - int bid = blockIdx.x; - - int group_index = bid * group_size + id; - int reg_count = 0; - - float max = -10000.0; - - while (id < group_size && reg_count < MAX_REG) { - float4 data_reg = vals_cast[group_index]; - data[reg_count] = data_reg; - - if (abs(data_reg.x) > max) max = abs(data_reg.x); - if (abs(data_reg.y) > max) max = abs(data_reg.y); - if (abs(data_reg.z) > max) max = abs(data_reg.z); - if (abs(data_reg.w) > max) max = abs(data_reg.w); - - group_index += blockDim.x; - id += blockDim.x; - reg_count++; - } - id = threadIdx.x; -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - __shared__ float partialMax[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - - b.sync(); - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } - - max = g.shfl(max, 0); - - float q_scale = (1 << num_bits) / (2 * max + 1e-5); - float q_scale_inv = 1 / q_scale; - for 
(int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - float4 q_data; - q_data = data[i]; - - float4 q_data_int; - q_data_int.x = roundf(q_data.x * q_scale); - q_data_int.y = roundf(q_data.y * q_scale); - q_data_int.w = roundf(q_data.w * q_scale); - q_data_int.z = roundf(q_data.z * q_scale); - - q_data.x = q_data_int.x * q_scale_inv; - q_data.y = q_data_int.y * q_scale_inv; - q_data.w = q_data_int.w * q_scale_inv; - q_data.z = q_data_int.z * q_scale_inv; - - vals_cast[group_index + bid * group_size] = q_data; - } - } -} - -template -void launch_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream) -{ - dim3 grid_dim(group_num); - dim3 block_dim(1024); - - hipLaunchKernelGGL(( quantize_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, (total_count / group_num) / 4, num_bits); -} - -template void launch_quantize_kernel(float* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); -template void launch_quantize_kernel(__half* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); - -__global__ void sr_quantize_kernel(__half* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - float2* vals_cast = reinterpret_cast(vals); - - __half2 data_low[128]; - __half2 data_high[128]; - - int bid = blockIdx.x; - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - unsigned int tid = threadIdx.x; - int reg_count = 0; - int offset = bid * token_size; - int group_index = bid * token_size + tid; - - int total_count = token_size * token_num; - if 
(group_index < total_count) { - // float min = 10000.0; - float max = -10000.0; - while (tid < token_size) { - float2 data = vals_cast[offset + tid]; - __half2* data_h = reinterpret_cast<__half2*>(&data); - data_low[reg_count] = data_h[0]; - data_high[reg_count] = data_h[1]; - - float2 data_f[2]; - data_f[0] = __half22float2(data_h[0]); - data_f[1] = __half22float2(data_h[1]); - - if (abs((float)data_f[0].x) > max) max = abs((float)data_f[0].x); - if (abs((float)data_f[0].y) > max) max = abs((float)data_f[0].y); - if (abs((float)data_f[1].x) > max) max = abs((float)data_f[1].x); - if (abs((float)data_f[1].y) > max) max = abs((float)data_f[1].y); - - tid += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } - - max = g.shfl(max, 0); - - float q_scale_val = (float)(1 << num_bits) / (max * 2 + 1e-5); - float high_q = (float)((1 << (num_bits - 1)) - 1); - float low_q = (float)(-((1 << (num_bits - 1)))); - - for (int i = 0; i < reg_count; i++) { - int token_index = i * blockDim.x + threadIdx.x; - if (token_index < token_size) { - float2 data_f[2]; - data_f[0] = __half22float2(data_low[i]); - data_f[1] = __half22float2(data_high[i]); - - float2 q_data_int[2]; - q_data_int[0].x = (float)((int)(data_f[0].x * q_scale_val)); - q_data_int[0].y = (float)((int)(data_f[0].y * q_scale_val)); - q_data_int[1].x = (float)((int)(data_f[1].x * q_scale_val)); - q_data_int[1].y = (float)((int)(data_f[1].y * q_scale_val)); - - // Stochastic rounding - float4 rand = hiprand_uniform4(&state); - - float q_error[4]; - q_error[0] = abs(data_f[0].x - (q_data_int[0].x / q_scale_val)) * q_scale_val; - q_error[1] = 
abs(data_f[0].y - (q_data_int[0].y / q_scale_val)) * q_scale_val; - q_error[2] = abs(data_f[1].x - (q_data_int[1].x / q_scale_val)) * q_scale_val; - q_error[3] = abs(data_f[1].y - (q_data_int[1].y / q_scale_val)) * q_scale_val; - - q_data_int[0].x = - (rand.x < q_error[0] && q_data_int[0].x > low_q && q_data_int[0].x < high_q) - ? (q_data_int[0].x + (data_f[0].x > 0 ? 1 : -1)) - : q_data_int[0].x; - q_data_int[0].y = - (rand.y < q_error[1] && q_data_int[0].y > low_q && q_data_int[0].y < high_q) - ? (q_data_int[0].y + (data_f[0].y > 0 ? 1 : -1)) - : q_data_int[0].y; - q_data_int[1].x = - (rand.w < q_error[2] && q_data_int[1].x > low_q && q_data_int[1].x < high_q) - ? (q_data_int[1].x + (data_f[1].x > 0 ? 1 : -1)) - : q_data_int[1].x; - q_data_int[1].y = - (rand.z < q_error[3] && q_data_int[1].y > low_q && q_data_int[1].y < high_q) - ? (q_data_int[1].y + (data_f[1].y > 0 ? 1 : -1)) - : q_data_int[1].y; - - data_f[0].x = q_data_int[0].x / q_scale_val; - data_f[0].y = q_data_int[0].y / q_scale_val; - data_f[1].x = q_data_int[1].x / q_scale_val; - data_f[1].y = q_data_int[1].y / q_scale_val; - - float2 result; - __half2* result_h = reinterpret_cast<__half2*>(&result); - result_h[0] = __float22half2_rn(data_f[0]); - result_h[1] = __float22half2_rn(data_f[1]); - - vals_cast[offset + token_index] = result; - } - } - } -#endif -} - -__global__ void sr_quantize_kernel(float* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - int idx = blockIdx.x * blockDim.x + id; - - float4* vals_cast = reinterpret_cast(vals); - - float4 data[128]; - - int bid = blockIdx.x; - int tid = threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - int group_index = bid * token_size + 
threadIdx.x; - int reg_count = 0; - int total_count = token_size * token_num; - if (group_index < total_count) { - // float min = 10000.0; - float max = -10000.0; - - while (tid < token_size) { - data[reg_count] = vals_cast[group_index]; - - if (abs(data[reg_count].x) > max) max = abs(data[reg_count].x); - if (abs(data[reg_count].y) > max) max = abs(data[reg_count].y); - if (abs(data[reg_count].z) > max) max = abs(data[reg_count].z); - if (abs(data[reg_count].w) > max) max = abs(data[reg_count].w); - - group_index += blockDim.x; - tid += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - __shared__ float partialMax[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } - - max = g.shfl(max, 0); - - float q_scale_val = (float)(1 << num_bits) / (max * 2 + 1e-5); - float high_q = (float)((1 << (num_bits - 1)) - 1); - float low_q = (float)(-((1 << (num_bits - 1)))); - - int offset = (bid)*token_size; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + threadIdx.x; - if (group_index < token_size) { - float4 q_data = data[i]; - - float4 q_data_int; - q_data_int.x = (float)((int)(q_data.x * q_scale_val)); - q_data_int.y = (float)((int)(q_data.y * q_scale_val)); - q_data_int.w = (float)((int)(q_data.w * q_scale_val)); - q_data_int.z = (float)((int)(q_data.z * q_scale_val)); - - // Stochastic rounding - float4 rand = hiprand_uniform4(&state); - - float q_error[4]; - q_error[0] = abs(q_data.x - (q_data_int.x / q_scale_val)) * q_scale_val; - q_error[1] = abs(q_data.y - (q_data_int.y / q_scale_val)) * q_scale_val; - q_error[2] = abs(q_data.w - (q_data_int.w / q_scale_val)) * q_scale_val; - q_error[3] = abs(q_data.z - (q_data_int.z / q_scale_val)) * 
q_scale_val; - - q_data_int.x = - (rand.x < q_error[0] && q_data_int.x > low_q && q_data_int.x < high_q) - ? (q_data_int.x + (q_data.x > 0 ? 1 : -1)) - : q_data_int.x; - q_data_int.y = - (rand.y < q_error[1] && q_data_int.y > low_q && q_data_int.y < high_q) - ? (q_data_int.y + (q_data.y > 0 ? 1 : -1)) - : q_data_int.y; - q_data_int.w = - (rand.w < q_error[2] && q_data_int.w > low_q && q_data_int.w < high_q) - ? (q_data_int.w + (q_data.w > 0 ? 1 : -1)) - : q_data_int.w; - q_data_int.z = - (rand.z < q_error[3] && q_data_int.z > low_q && q_data_int.z < high_q) - ? (q_data_int.z + (q_data.z > 0 ? 1 : -1)) - : q_data_int.z; - - q_data_int.x /= q_scale_val; - q_data_int.y /= q_scale_val; - q_data_int.w /= q_scale_val; - q_data_int.z /= q_scale_val; - - vals_cast[group_index + offset] = q_data_int; - } - } - } -} - -template -void launch_sr_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream) -{ - dim3 block_dim(1024); - dim3 grid_dim(group_num); - - uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - hipLaunchKernelGGL(( sr_quantize_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, (total_count / group_num) / 4, group_num, num_bits, seed); -} -template void launch_sr_quantize_kernel(float* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); -template void launch_sr_quantize_kernel(__half* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); - -__global__ void quantize_kernel_asym(__half* vals, int group_size, int num_bits) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - float2* vals_cast = reinterpret_cast(vals); - - float2 data[MAX_REG]; - - int 
group_id = blockIdx.x; - - { - int group_index = id; - int reg_count = 0; - int offset = group_id * group_size; - float max = -10000.0; - float min = 10000.0; - - while (group_index < group_size && reg_count < MAX_REG) { - data[reg_count] = vals_cast[offset + group_index]; - __half* data_h = reinterpret_cast<__half*>(&data[reg_count]); - - if (((float)data_h[0]) > max) max = (float)data_h[0]; - if (((float)data_h[1]) > max) max = (float)data_h[1]; - if (((float)data_h[2]) > max) max = (float)data_h[2]; - if (((float)data_h[3]) > max) max = (float)data_h[3]; - - if (((float)data_h[0]) < min) min = (float)data_h[0]; - if (((float)data_h[1]) < min) min = (float)data_h[1]; - if (((float)data_h[2]) < min) min = (float)data_h[2]; - if (((float)data_h[3]) < min) min = (float)data_h[3]; - - group_index += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(min, i); - if (min > temp) min = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - __shared__ float partialMin[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - if (lane == 0) partialMin[gid] = min; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - if (lane < warp_num) min = partialMin[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(min, i); - if (min > temp) min = temp; - } - - max = g.shfl(max, 0); - min = g.shfl(min, 0); - - float q_scale = ((max - min) + 1e-5) / (float)(1 << num_bits); - float q_scale_inv = 1 / q_scale; - - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - __half2* data_h = reinterpret_cast<__half2*>(&data[i]); - float2 q_data[2]; - q_data[0] = 
__half22float2(data_h[0]); - q_data[1] = __half22float2(data_h[1]); - - float2 q_data_int[2]; - - q_data_int[0].x = roundf((q_data[0].x - min) * q_scale_inv); - q_data_int[0].y = roundf((q_data[0].y - min) * q_scale_inv); - q_data_int[1].x = roundf((q_data[1].x - min) * q_scale_inv); - q_data_int[1].y = roundf((q_data[1].y - min) * q_scale_inv); - - q_data_int[0].x = q_data_int[0].x * q_scale + min; - q_data_int[0].y = q_data_int[0].y * q_scale + min; - q_data_int[1].x = q_data_int[1].x * q_scale + min; - q_data_int[1].y = q_data_int[1].y * q_scale + min; - - data_h[0] = __float22half2_rn(q_data_int[0]); - data_h[1] = __float22half2_rn(q_data_int[1]); - - vals_cast[offset + group_index] = data[i]; - } - } - } -#endif -} - -__global__ void quantize_kernel_asym(float* vals, int group_size, int num_bits) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - float4* vals_cast = reinterpret_cast(vals); - - float4 data[MAX_REG]; - - int bid = blockIdx.x; - - int group_index = bid * group_size + id; - int reg_count = 0; - - float max = -10000.0; - float min = 10000.0; - - while (id < group_size && reg_count < MAX_REG) { - float4 data_reg = vals_cast[group_index]; - data[reg_count] = data_reg; - - if (data_reg.x > max) max = data_reg.x; - if (data_reg.y > max) max = data_reg.y; - if (data_reg.w > max) max = data_reg.w; - if (data_reg.z > max) max = data_reg.z; - - if (data_reg.x < min) min = data_reg.x; - if (data_reg.y < min) min = data_reg.y; - if (data_reg.w < min) min = data_reg.w; - if (data_reg.z < min) min = data_reg.z; - - group_index += blockDim.x; - id += blockDim.x; - reg_count++; - } - id = threadIdx.x; - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; 
i <<= 1) { - auto temp = g.shfl_xor(min, i); - if (min > temp) min = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - __shared__ float partialMin[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - if (lane == 0) partialMin[gid] = min; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - if (lane < warp_num) min = partialMin[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(min, i); - if (min > temp) min = temp; - } - - max = g.shfl(max, 0); - min = g.shfl(min, 0); - - float q_scale = ((max - min) + 1e-5) / (float)(1 << num_bits); - float q_scale_inv = 1 / q_scale; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - float4 q_data; - q_data = data[i]; - - float4 q_data_int; - q_data_int.x = roundf((q_data.x - min) * q_scale_inv); - q_data_int.y = roundf((q_data.y - min) * q_scale_inv); - q_data_int.w = roundf((q_data.w - min) * q_scale_inv); - q_data_int.z = roundf((q_data.z - min) * q_scale_inv); - - q_data.x = q_data_int.x * q_scale + min; - q_data.y = q_data_int.y * q_scale + min; - q_data.w = q_data_int.w * q_scale + min; - q_data.z = q_data_int.z * q_scale + min; - - vals_cast[group_index + bid * group_size] = q_data; - } - } -} - -template -void launch_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream) -{ - dim3 grid_dim(group_num); - dim3 block_dim(1024); - - hipLaunchKernelGGL(( quantize_kernel_asym), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, (total_count / group_num) / 4, num_bits); -} - -template void launch_quantize_kernel_asym(float* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); -template void launch_quantize_kernel_asym(__half* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t 
stream); - -__global__ void sr_quantize_kernel_asym(__half* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - float2* vals_cast = reinterpret_cast(vals); - - __half2 data_low[128]; - __half2 data_high[128]; - - int bid = blockIdx.x; - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - unsigned int tid = threadIdx.x; - int reg_count = 0; - int offset = bid * token_size; - int group_index = bid * token_size + tid; - - int total_count = token_size * token_num; - if (group_index < total_count) { - float min = 10000.0; - float max = -10000.0; - while (tid < token_size) { - float2 data = vals_cast[offset + tid]; - __half2* data_h = reinterpret_cast<__half2*>(&data); - data_low[reg_count] = data_h[0]; - data_high[reg_count] = data_h[1]; - - float2 data_f[2]; - data_f[0] = __half22float2(data_h[0]); - data_f[1] = __half22float2(data_h[1]); - - if (((float)data_f[0].x) > max) max = (float)data_f[0].x; - if (((float)data_f[0].y) > max) max = (float)data_f[0].y; - if (((float)data_f[1].x) > max) max = (float)data_f[1].x; - if (((float)data_f[1].y) > max) max = (float)data_f[1].y; - - if (((float)data_f[0].x) < min) min = (float)data_f[0].x; - if (((float)data_f[0].y) < min) min = (float)data_f[0].y; - if (((float)data_f[1].x) < min) min = (float)data_f[1].x; - if (((float)data_f[1].y) < min) min = (float)data_f[1].y; - - tid += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(min, i); - if 
(min > temp) min = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - __shared__ float partialMin[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - if (lane == 0) partialMin[gid] = min; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - if (lane < warp_num) min = partialMin[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(min, i); - if (min > temp) min = temp; - } - - max = g.shfl(max, 0); - min = g.shfl(min, 0); - - float q_scale_val = ((max - min) + 1e-5) / (float)(1 << num_bits); - float q_scale_val_inv = 1 / q_scale_val; - float high_q = (float)((1 << num_bits) - 1); - - for (int i = 0; i < reg_count; i++) { - int token_index = i * blockDim.x + threadIdx.x; - if (token_index < token_size) { - float2 data_f[2]; - data_f[0] = __half22float2(data_low[i]); - data_f[1] = __half22float2(data_high[i]); - - float2 q_data_int[2]; - q_data_int[0].x = (float)((unsigned int)((data_f[0].x - min) * q_scale_val_inv)); - q_data_int[0].y = (float)((unsigned int)((data_f[0].y - min) * q_scale_val_inv)); - q_data_int[1].x = (float)((unsigned int)((data_f[1].x - min) * q_scale_val_inv)); - q_data_int[1].y = (float)((unsigned int)((data_f[1].y - min) * q_scale_val_inv)); - - // Stochastic rounding - float4 rand = hiprand_uniform4(&state); - - float q_error[4]; - q_error[0] = - abs(data_f[0].x - ((q_data_int[0].x * q_scale_val) + min)) * q_scale_val_inv; - q_error[1] = - abs(data_f[0].y - ((q_data_int[0].y * q_scale_val) + min)) * q_scale_val_inv; - q_error[2] = - abs(data_f[1].x - ((q_data_int[1].x * q_scale_val) + min)) * q_scale_val_inv; - q_error[3] = - abs(data_f[1].y - ((q_data_int[1].y * q_scale_val) + min)) * q_scale_val_inv; - - q_data_int[0].x = (rand.x < q_error[0] && q_data_int[0].x < high_q) - ? 
(q_data_int[0].x + 1) - : q_data_int[0].x; - q_data_int[0].y = (rand.y < q_error[1] && q_data_int[0].y < high_q) - ? (q_data_int[0].y + 1) - : q_data_int[0].y; - q_data_int[1].x = (rand.w < q_error[2] && q_data_int[1].x < high_q) - ? (q_data_int[1].x + 1) - : q_data_int[1].x; - q_data_int[1].y = (rand.z < q_error[3] && q_data_int[1].y < high_q) - ? (q_data_int[1].y + 1) - : q_data_int[1].y; - - data_f[0].x = q_data_int[0].x * q_scale_val + min; - data_f[0].y = q_data_int[0].y * q_scale_val + min; - data_f[1].x = q_data_int[1].x * q_scale_val + min; - data_f[1].y = q_data_int[1].y * q_scale_val + min; - - float2 result; - __half2* result_h = reinterpret_cast<__half2*>(&result); - result_h[0] = __float22half2_rn(data_f[0]); - result_h[1] = __float22half2_rn(data_f[1]); - - vals_cast[offset + token_index] = result; - } - } - } -#endif -} - -__global__ void sr_quantize_kernel_asym(float* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - int idx = blockIdx.x * blockDim.x + id; - - float4* vals_cast = reinterpret_cast(vals); - - float4 data[128]; - - int bid = blockIdx.x; - int tid = threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - int group_index = bid * token_size + threadIdx.x; - int reg_count = 0; - int total_count = token_size * token_num; - if (group_index < total_count) { - float min = 10000.0; - float max = -10000.0; - - while (tid < token_size) { - float4 data_reg = vals_cast[group_index]; - data[reg_count] = data_reg; - if (data_reg.x > max) max = data_reg.x; - if (data_reg.y > max) max = data_reg.y; - if (data_reg.w > max) max = data_reg.w; - if (data_reg.z > max) max = data_reg.z; - - if (data_reg.x < min) min = data_reg.x; - if (data_reg.y < 
min) min = data_reg.y; - if (data_reg.w < min) min = data_reg.w; - if (data_reg.z < min) min = data_reg.z; - - group_index += blockDim.x; - tid += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(min, i); - if (min > temp) min = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - __shared__ float partialMin[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - if (lane == 0) partialMin[gid] = min; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - if (lane < warp_num) min = partialMin[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(min, i); - if (min > temp) min = temp; - } - - max = g.shfl(max, 0); - min = g.shfl(min, 0); - - float q_scale_val = ((max - min) + 1e-5) / (float)(1 << num_bits); - float high_q = (float)((1 << num_bits) - 1); - - int offset = (bid)*token_size; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + threadIdx.x; - if (group_index < token_size) { - float4 q_data = data[i]; - - float4 q_data_int; - q_data_int.x = (float)((int)((q_data.x - min) / q_scale_val)); - q_data_int.y = (float)((int)((q_data.y - min) / q_scale_val)); - q_data_int.w = (float)((int)((q_data.w - min) / q_scale_val)); - q_data_int.z = (float)((int)((q_data.z - min) / q_scale_val)); - - // Stochastic rounding - float4 rand = hiprand_uniform4(&state); - - float q_error[4]; - q_error[0] = abs(q_data.x - ((q_data_int.x * q_scale_val) + min)) / q_scale_val; - q_error[1] = abs(q_data.y - ((q_data_int.y * q_scale_val) + min)) / q_scale_val; - q_error[2] = abs(q_data.w - ((q_data_int.w * q_scale_val) + min)) / q_scale_val; - q_error[3] = abs(q_data.z - ((q_data_int.z * 
q_scale_val) + min)) / q_scale_val; - - q_data_int.x = (rand.x < q_error[0] && q_data_int.x < high_q) ? (q_data_int.x + 1) - : q_data_int.x; - q_data_int.y = (rand.y < q_error[1] && q_data_int.y < high_q) ? (q_data_int.y + 1) - : q_data_int.y; - q_data_int.w = (rand.w < q_error[2] && q_data_int.w < high_q) ? (q_data_int.w + 1) - : q_data_int.w; - q_data_int.z = (rand.z < q_error[3] && q_data_int.z < high_q) ? (q_data_int.z + 1) - : q_data_int.z; - - q_data_int.x = q_data_int.x * q_scale_val + min; - q_data_int.y = q_data_int.y * q_scale_val + min; - q_data_int.w = q_data_int.w * q_scale_val + min; - q_data_int.z = q_data_int.z * q_scale_val + min; - - vals_cast[group_index + offset] = q_data_int; - } - } - } -} -template -void launch_sr_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream) -{ - dim3 block_dim(1024); - dim3 grid_dim(group_num); - - uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - hipLaunchKernelGGL(( sr_quantize_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, (total_count / group_num) / 4, group_num, num_bits, seed); -} -template void launch_sr_quantize_kernel_asym(float* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); -template void launch_sr_quantize_kernel_asym(__half* vals, - int total_count, - int group_num, - int num_bits, - hipStream_t stream); diff --git a/deepspeed/ops/csrc/sparse_attention/utils.cpp b/deepspeed/ops/csrc/sparse_attention/utils.cpp deleted file mode 100644 index 8e4346be8a299a09d38ce22adf1c2f80385620c1..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/sparse_attention/utils.cpp +++ /dev/null @@ -1,120 +0,0 @@ -// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a -// https://github.com/ptillet/torch-blocksparse/blob/master/csrc/utils.cpp - -#include -#include -#include -#include -#ifdef _OPENMP -#include 
-#endif - -typedef std::vector> ret_t; - -void segment_blocks(torch::Tensor layout, - torch::Tensor idx, - torch::Tensor scratch, - int max_width, - ret_t& ret) -{ - size_t H = layout.size(0); - size_t M = layout.size(1); - size_t N = layout.size(2); - torch::Tensor tmp = torch::zeros_like(layout); - - auto _tmp = tmp.accessor(); - auto _layout = layout.accessor(); - auto _idx = idx.accessor(); - auto _scratch = scratch.accessor(); - std::vector current(H, 0); - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (size_t h = 0; h < H; h++) { - // surrounding indices - std::vector ii_left(max_width, -1); - std::vector> ii_top(max_width, std::vector(N, -1)); - - for (size_t m = 0; m < M; m++) { - for (size_t n = 0; n < N; n++) { - int v = _layout[h][m][n]; - if (v == 0) continue; - int n_left = ii_left[max_width - 1]; - int m_top = ii_top[max_width - 1][n]; - int top = (m_top >= 0) ? _tmp[h][m_top][n] : 0; - int left = (n_left >= 0) ? _tmp[h][m][n_left] : 0; - int topleft = (m_top >= 0 && n_left >= 0) ? 
_tmp[h][m_top][n_left] : 0; - int width = std::min(left, std::min(top, topleft)) + 1; - - // reset width if blocks cannot be - // packed together (i.e., there's a 1 "in the middle") - for (int nn = n_left + 1; nn < n; nn++) - if (ii_top[max_width - 1][nn] > ii_top[max_width - 1][n]) width = 1; - _tmp[h][m][n] = width; - - // update n_left ring buffer - for (int k = 0; k < max_width - 1; k++) ii_left[k] = ii_left[k + 1]; - ii_left[max_width - 1] = n; - - // update ii_top ring buffer - for (int k = 0; k < max_width - 1; k++) ii_top[k][n] = ii_top[k + 1][n]; - ii_top[max_width - 1][n] = m; - - // block is too small -- skip - if (width != max_width) continue; - - // retained blocks are set to zeros - for (size_t km = 0; km < max_width; km++) - for (size_t kn = 0; kn < max_width; kn++) { - int mm = ii_top[km][n]; - int nn = ii_left[kn]; - if (mm < 0 || nn < 0) continue; - _layout[h][mm][nn] = 0; - _tmp[h][mm][nn] = 0; - _scratch[h][current[h]][0] = (int)h; - _scratch[h][current[h]][1] = (int)mm; - _scratch[h][current[h]][2] = (int)nn; - _scratch[h][current[h]][3] = _idx[h][mm][nn]; - current[h]++; - } - } - } - } - std::vector to_cat; - for (size_t h = 0; h < H; h++) - if (current[h] > 0) to_cat.push_back(scratch[h].slice(0, 0, current[h])); - if (!to_cat.empty()) ret.push_back({max_width, torch::cat(to_cat)}); -} - -ret_t sdd_segment(torch::Tensor layout, int start_width) -{ - ret_t ret; - - // block index - torch::Tensor idx = torch::zeros_like(layout); - int current = 0; - int64_t H = layout.size(0); - int64_t M = layout.size(1); - int64_t N = layout.size(2); - auto _layout = layout.accessor(); - auto _idx = idx.accessor(); - for (int64_t h = 0; h < H; h++) - for (int64_t m = 0; m < M; m++) - for (int64_t n = 0; n < N; n++) { - if (_layout[h][m][n] == 0) continue; - _idx[h][m][n] = current++; - } - - // scratch memory - torch::Tensor scratch = torch::empty({H, layout.sum().item(), 4}, layout.dtype()); - - for (int max_width = start_width; max_width > 0; max_width /= 
2) - segment_blocks(layout, idx, scratch, max_width, ret); - return ret; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("sdd_segment", &sdd_segment, "SDD segmentation handler"); -} diff --git a/deepspeed/ops/csrc/transformer/cublas_wrappers.cu b/deepspeed/ops/csrc/transformer/cublas_wrappers.cu deleted file mode 100644 index 75ecd3fb4ef9d5d63d9c7681bdce0cf949641b5d..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/cublas_wrappers.cu +++ /dev/null @@ -1,403 +0,0 @@ -#include "cublas_wrappers.h" - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f32_r, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - rocblas_datatype_f32_r, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - C, - rocblas_datatype_f32_r, - m, - C, - rocblas_datatype_f32_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - CUDA_R_32F, - (transa == CUBLAS_OP_N) ? m : k, - (const void*)B, - CUDA_R_32F, - (transb == CUBLAS_OP_N) ? 
k : n, - (const void*)beta, - C, - CUDA_R_32F, - m, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f16_r, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - rocblas_datatype_f16_r, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - (void*)C, - rocblas_datatype_f16_r, - m, - (void*)C, - rocblas_datatype_f16_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - CUDA_R_16F, - (transa == CUBLAS_OP_N) ? m : k, - (const void*)B, - CUDA_R_16F, - (transb == CUBLAS_OP_N) ? k : n, - (const void*)beta, - (void*)C, - CUDA_R_16F, - m, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. 
(m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f32_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f32_r, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - rocblas_datatype_f32_r, - m, - stride_C, - C, - rocblas_datatype_f32_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - CUDA_R_32F, - (op_A == CUBLAS_OP_N) ? m : k, - stride_A, - B, - CUDA_R_32F, - (op_B == CUBLAS_OP_N) ? k : n, - stride_B, - beta, - C, - CUDA_R_32F, - m, - stride_C, - batch, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. 
(batch: %d, m: %d, n: %d, k: %d, error: %d) \n", - batch, - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f16_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f16_r, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - rocblas_datatype_f16_r, - m, - stride_C, - C, - rocblas_datatype_f16_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - CUDA_R_16F, - (op_A == CUBLAS_OP_N) ? m : k, - stride_A, - B, - CUDA_R_16F, - (op_B == CUBLAS_OP_N) ? k : n, - stride_B, - beta, - C, - CUDA_R_16F, - m, - stride_C, - batch, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. 
(m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - - return 0; -} diff --git a/deepspeed/ops/csrc/transformer/cublas_wrappers.hip b/deepspeed/ops/csrc/transformer/cublas_wrappers.hip deleted file mode 100644 index 04aa0ef0a7d083a50fc7d4ec8f01b24e2ccd52e8..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/cublas_wrappers.hip +++ /dev/null @@ -1,404 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "cublas_wrappers_hip.h" - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f32_r, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - rocblas_datatype_f32_r, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - C, - rocblas_datatype_f32_r, - m, - C, - rocblas_datatype_f32_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - rocblas_status status = rocblas_gemmex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - hipR32F, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - hipR32F, - (transb == rocblas_operation_none) ? 
k : n, - (const void*)beta, - C, - hipR32F, - m, - hipR32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != rocblas_status_success) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f16_r, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - rocblas_datatype_f16_r, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - (void*)C, - rocblas_datatype_f16_r, - m, - (void*)C, - rocblas_datatype_f16_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - rocblas_status status = rocblas_gemmex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - hipR16F, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - hipR16F, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - (void*)C, - hipR16F, - m, - hipR32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != rocblas_status_success) { -#endif - fprintf(stderr, - "!!!! kernel execution error. 
(m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f32_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f32_r, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - rocblas_datatype_f32_r, - m, - stride_C, - C, - rocblas_datatype_f32_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - rocblas_status status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - hipR32F, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - hipR32F, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - hipR32F, - m, - stride_C, - batch, - hipR32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != rocblas_status_success) { -#endif - fprintf(stderr, - "!!!! kernel execution error. 
(batch: %d, m: %d, n: %d, k: %d, error: %d) \n", - batch, - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f16_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f16_r, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - rocblas_datatype_f16_r, - m, - stride_C, - C, - rocblas_datatype_f16_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - rocblas_status status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - hipR16F, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - hipR16F, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - hipR16F, - m, - stride_C, - batch, - hipR32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != rocblas_status_success) { -#endif - fprintf(stderr, - "!!!! kernel execution error. 
(m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - - return 0; -} diff --git a/deepspeed/ops/csrc/transformer/dropout_kernels.cu b/deepspeed/ops/csrc/transformer/dropout_kernels.cu deleted file mode 100644 index d1ba135f4900f8eff3f6b4cab70d1b35b39f7833..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/dropout_kernels.cu +++ /dev/null @@ -1,868 +0,0 @@ -#include "custom_cuda_layers.h" - -const int unroll_factor = 4; - -__global__ void dropout_kernel(const int N, - const float ratio, - float* out, - const float* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float4 rand = curand_uniform4(&state); - uint8_t m[unroll_factor]; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - int i = j * unroll_factor; - - mask[i] = (uint8_t)m[0]; - mask[i + 1] = (uint8_t)m[1]; - mask[i + 2] = (uint8_t)m[2]; - mask[i + 3] = (uint8_t)m[3]; - - out[i] = Xdata[i] * scale * m[0]; - out[i + 1] = Xdata[i + 1] * scale * m[1]; - out[i + 2] = Xdata[i + 2] * scale * m[2]; - out[i + 3] = Xdata[i + 3] * scale * m[3]; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - out[i] = Xdata[i] * scale * m; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const float ratio, - __half* out, - const __half* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - -#ifdef __STOCHASTIC_MODE__ - - const __half2 h_scale = __float2half2_rn(scale); - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_cast = reinterpret_cast(mask); - - uint32_t m_32; - uint8_t* m = reinterpret_cast(&m_32); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - __half2 mask_h[2]; - float2 mask_f[2]; - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_f = x_cast[j]; - __half2* x_h = reinterpret_cast<__half2*>(&x_f); - - float4 rand = curand_uniform4(&state); - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); - - mask_h[0] = __float22half2_rn(mask_f[0]); - mask_h[1] = __float22half2_rn(mask_f[1]); - - result_h[0] = x_h[0] * h_scale * mask_h[0]; - result_h[1] = x_h[1] * h_scale * mask_h[1]; - - out_cast[j] = result_f; - - mask_cast[j] = m_32; - } - -#else - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - const __half2* vals_half = reinterpret_cast(Xdata + i); - float2 vals_half_f[2]; - vals_half_f[0] = __half22float2(vals_half[0]); - vals_half_f[1] = __half22float2(vals_half[1]); - - uint8_t m[unroll_factor]; - float4 rand = curand_uniform4(&state); - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - out[i] = __float2half(vals_half_f[0].x * scale * m[0]); - out[i + 1] = __float2half(vals_half_f[0].y * scale * m[1]); - out[i + 2] = __float2half(vals_half_f[1].x * scale * m[2]); - out[i + 3] = __float2half(vals_half_f[1].y * scale * m[3]); - - mask[i] = m[0]; 
- mask[i + 1] = m[1]; - mask[i + 2] = m[2]; - mask[i + 3] = m[3]; - } - -#endif - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - out[i] = __float2half((float)Xdata[i] * scale * m); - mask[i] = m; - } - } -} - -__global__ void dropout_kernel_bwd(const int N, - const float ratio, - const float* Xdata, - float* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - out[i] = mask[i] ? Xdata[i] * scale : 0.0; - out[i + 1] = mask[i + 1] ? Xdata[i + 1] * scale : 0.0; - out[i + 2] = mask[i + 2] ? Xdata[i + 2] * scale : 0.0; - out[i + 3] = mask[i + 3] ? Xdata[i + 3] * scale : 0.0; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { out[i] = mask[i] ? Xdata[i] * scale : 0.0; } - } -} - -__global__ void dropout_kernel_bwd(const int N, - const float ratio, - const __half* Xdata, - __half* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - -#ifdef __STOCHASTIC_MODE__ - - const __half2 h_scale = __float2half2_rn(scale); - - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_cast = reinterpret_cast(mask); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_f = x_cast[j]; - __half2* x_h = reinterpret_cast<__half2*>(&x_f); - - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - __half2 mask_h[2]; - float2 mask_f[2]; - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); - -#pragma unroll - for (int i = 0; i < 2; i++) mask_h[i] = __float22half2_rn(mask_f[i]); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = x_h[0] * h_scale * mask_h[0]; - result_h[1] = x_h[1] * h_scale * mask_h[1]; - - out_cast[j] = result_f; - } - -#else - - const __half h_scale = __float2half(scale); - const __half h_zero = __float2half(0.0); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - const __half2* vals_half = reinterpret_cast(Xdata + i); - - uint8_t* m = mask + i; - - float2 vals_half_f[2]; - - vals_half_f[0] = __half22float2(vals_half[0]); - vals_half_f[1] = __half22float2(vals_half[1]); - - out[i] = __float2half(vals_half_f[0].x * scale * m[0]); - out[i + 1] = __float2half(vals_half_f[0].y * scale * m[1]); - out[i + 2] = __float2half(vals_half_f[1].x * scale * m[2]); - out[i + 3] = __float2half(vals_half_f[1].y * scale * m[3]); - } - -#endif - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - out[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout(T* out, - const T* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - cudaStream_t stream, - bool bwd) -{ - assert(unroll_factor == 4); - - dim3 
grid_dim = DS_GET_BLOCKS(total_count / unroll_factor); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - if (dim > 512) { - block_dim.x >>= 1; - grid_dim.x <<= 1; - } - uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - if (bwd) - dropout_kernel_bwd<<>>( - total_count, ratio, vals, out, mask, seed); - else - dropout_kernel<<>>( - total_count, ratio, out, vals, mask, seed); -} - -template void launch_dropout(float* out, - const float* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - cudaStream_t stream, - bool); -template void launch_dropout(__half* out, - const __half* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - cudaStream_t stream, - bool); - -__global__ void dropout_grad_kernel(const int N, const float scale, float* Xdata, uint8_t* mask) -{ - CUDA_1D_KERNEL_LOOP(i, N) { Xdata[i] *= scale * mask[i]; } -} - -__global__ void dropout_grad_kernel(const int N, const float scale, __half* Xdata, uint8_t* mask) -{ - const __half2 h_scale = __float2half2_rn(scale); - float2* x_cast = reinterpret_cast(Xdata); - uint32_t* mask_cast = reinterpret_cast(mask); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_data = x_cast[j]; - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - -#ifdef __STOCHASTIC_MODE__ - - __half2* x_data_h = reinterpret_cast<__half2*>(&x_data); - __half2 mask_h[2]; - float2 mask_f[2]; - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) *(mask_f_data++) = (float)(m[i]); - - mask_h[0] = __float22half2_rn(mask_f[0]); - mask_h[1] = __float22half2_rn(mask_f[1]); - - result_h[0] = x_data_h[0] * h_scale * mask_h[0]; - result_h[1] = x_data_h[1] * h_scale * mask_h[1]; - -#else - - __half* x_data_h = reinterpret_cast<__half*>(&x_data); - float2 result[2]; - - result[0].x = (float)x_data_h[0] * scale * m[0]; - 
result[0].y = (float)x_data_h[1] * scale * m[1]; - result[1].x = (float)x_data_h[2] * scale * m[2]; - result[1].y = (float)x_data_h[3] * scale * m[3]; - - result_h[0] = __float22half2_rn(result[0]); - result_h[1] = __float22half2_rn(result[1]); - -#endif - x_cast[j] = result_f; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - Xdata[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, cudaStream_t stream) -{ - assert(unroll_factor == 4); - - const float scale = 1. / (1. - ratio); - dropout_grad_kernel<<>>(total_count, scale, vals, mask); -} - -template void launch_dropout_grad(float* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream); -template void launch_dropout_grad(__half* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream); - -__global__ void dropout_grad_kernel(const int N, - const float scale, - const float* Xdata, - float* out, - uint8_t* mask) -{ - CUDA_1D_KERNEL_LOOP(i, N) { out[i] = Xdata[i] * scale * mask[i]; } -} - -__global__ void dropout_grad_kernel(const int N, - const float scale, - const __half* Xdata, - __half* out, - uint8_t* mask) -{ - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - const uint32_t* mask_cast = reinterpret_cast(mask); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_data = x_cast[j]; - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - __half* x_data_h = reinterpret_cast<__half*>(&x_data); - float2 result[2]; - - result[0].x = (float)x_data_h[0] * scale * m[0]; - result[0].y = (float)x_data_h[1] * scale * m[1]; - result[1].x = (float)x_data_h[2] * scale * m[2]; - result[1].y = 
(float)x_data_h[3] * scale * m[3]; - - result_h[0] = __float22half2_rn(result[0]); - result_h[1] = __float22half2_rn(result[1]); - - out_cast[j] = result_f; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - out[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout_grad(T* vals_out, - const T* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream) -{ - assert(unroll_factor == 4); - - const float scale = 1. / (1. - ratio); - dropout_grad_kernel<<>>(total_count, scale, vals, vals_out, mask); -} -template void launch_dropout_grad(float*, - const float* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream); -template void launch_dropout_grad(__half*, - const __half* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream); - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const float* bias, - float* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - float4* Xdata_cast = reinterpret_cast(Xdata); - uint32_t* mask_32 = reinterpret_cast(mask); - const float4* bias_cast = reinterpret_cast(bias); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = curand_uniform4(&state); - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float4 x_data = Xdata_cast[j]; - float4 b_data = bias_cast[j % (dim / unroll_factor)]; - - x_data.x += b_data.x; - x_data.y += b_data.y; - x_data.z += b_data.z; - x_data.w += b_data.w; - - x_data.x = x_data.x * scale * m[0]; - x_data.y = x_data.y * scale * m[1]; - x_data.z = x_data.z * scale * m[2]; - x_data.w = x_data.w * scale * m[3]; - - mask_32[j] = m_32; - Xdata_cast[j] = x_data; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = Xdata[i] + bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - Xdata[i] = x_data * scale * m; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const __half* bias, - __half* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - float2* Xdata_cast = reinterpret_cast(Xdata); - uint32_t* mask_32 = reinterpret_cast(mask); - const float2* bias_cast = reinterpret_cast(bias); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = curand_uniform4(&state); - - float2 data_f; - __half2* data_h = reinterpret_cast<__half2*>(&data_f); - - float2 bias_f; - __half2* bias_h = reinterpret_cast<__half2*>(&bias_f); - - data_f = Xdata_cast[j]; - bias_f = bias_cast[j % (dim / unroll_factor)]; - - float2 data_h_0 = __half22float2(data_h[0]); - float2 data_h_1 = __half22float2(data_h[1]); - - float2 bias_h_0 = __half22float2(bias_h[0]); - float2 bias_h_1 = __half22float2(bias_h[1]); - - data_h_0.x += bias_h_0.x; - data_h_0.y += bias_h_0.y; - data_h_1.x += bias_h_1.x; - data_h_1.y += bias_h_1.y; - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - data_h_0.x = __float2half(data_h_0.x * scale * m[0]); - data_h_0.y = __float2half(data_h_0.y * scale * m[1]); - data_h_1.x = __float2half(data_h_1.x * scale * m[2]); - data_h_1.y = __float2half(data_h_1.y * scale * m[3]); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = __float22half2_rn(data_h_0); - result_h[1] = __float22half2_rn(data_h_1); - - Xdata_cast[j] = result_f; - mask_32[j] = m_32; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = (float)Xdata[i] + (float)bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - Xdata[i] = 
__float2half(x_data * scale * m); - mask[i] = m; - } - } -} - -template -void launch_dropout(T* out, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream) -{ - assert(unroll_factor == 4); - - int total_count = batch * dim / unroll_factor; - - dim3 grid_dim = DS_GET_BLOCKS(total_count); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - dropout_kernel<<>>( - total_count, dim, ratio, bias, out, mask, seed); -} - -template void launch_dropout(float*, - const float* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); -template void launch_dropout(__half*, - const __half* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const float* input, - const float* residual, - const float* bias, - float* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - float4* out_cast = reinterpret_cast(out); - uint32_t* mask_32 = reinterpret_cast(mask); - - const float4* bias_cast = reinterpret_cast(bias); - const float4* residual_cast = reinterpret_cast(residual); - const float4* input_cast = reinterpret_cast(input); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = curand_uniform4(&state); - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float4 out_data; - float4 b_data = bias_cast[j % (dim / unroll_factor)]; - float4 res_data = residual_cast[j]; - float4 inp_data = input_cast[j]; - - out_data.x = (b_data.x + inp_data.x); - out_data.y = (b_data.y + inp_data.y); - out_data.z = (b_data.z + inp_data.z); - out_data.w = (b_data.w + inp_data.w); - - out_data.x = out_data.x * scale * m[0]; - out_data.y = out_data.y * scale * m[1]; - out_data.z = out_data.z * scale * m[2]; - out_data.w = out_data.w * scale * m[3]; - - out_data.x += res_data.x; - out_data.y += res_data.y; - out_data.z += res_data.z; - out_data.w += res_data.w; - - mask_32[j] = m_32; - out_cast[j] = out_data; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = input[i] + bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - x_data = x_data * scale * m; - x_data += residual[i]; - - out[i] = x_data; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const __half* input, - const __half* residual, - const __half* bias, - 
__half* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_32 = reinterpret_cast(mask); - - const float2* bias_cast = reinterpret_cast(bias); - const float2* residual_cast = reinterpret_cast(residual); - const float2* input_cast = reinterpret_cast(input); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = curand_uniform4(&state); - - float2 data_f; - __half2* data_h = reinterpret_cast<__half2*>(&data_f); - - float2 bias_f; - __half2* bias_h = reinterpret_cast<__half2*>(&bias_f); - - float2 residual_f; - __half2* residual_h = reinterpret_cast<__half2*>(&residual_f); - - float2 input_f; - __half2* input_h = reinterpret_cast<__half2*>(&input_f); - - bias_f = bias_cast[j % (dim / unroll_factor)]; - residual_f = residual_cast[j]; - input_f = input_cast[j]; - - float2 data_h_0 = __half22float2(data_h[0]); - float2 data_h_1 = __half22float2(data_h[1]); - - float2 bias_h_0 = __half22float2(bias_h[0]); - float2 bias_h_1 = __half22float2(bias_h[1]); - - float2 residual_h_0 = __half22float2(residual_h[0]); - float2 residual_h_1 = __half22float2(residual_h[1]); - - float2 input_h_0 = __half22float2(input_h[0]); - float2 input_h_1 = __half22float2(input_h[1]); - - data_h_0.x = (bias_h_0.x + input_h_0.x); - data_h_0.y = (bias_h_0.y + input_h_0.y); - data_h_1.x = (bias_h_1.x + input_h_1.x); - data_h_1.y = (bias_h_1.y + input_h_1.y); - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - data_h_0.x = __float2half(data_h_0.x * scale * m[0]); - data_h_0.y = __float2half(data_h_0.y * scale * m[1]); - data_h_1.x = __float2half(data_h_1.x * scale * m[2]); - 
data_h_1.y = __float2half(data_h_1.y * scale * m[3]); - - data_h_0.x += residual_h_0.x; - data_h_0.y += residual_h_0.y; - data_h_1.x += residual_h_1.x; - data_h_1.y += residual_h_1.y; - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = __float22half2_rn(data_h_0); - result_h[1] = __float22half2_rn(data_h_1); - - out_cast[j] = result_f; - mask_32[j] = m_32; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = (float)input[i] + (float)bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - x_data = x_data * scale * m; - x_data += (float)residual[i]; - - out[i] = __float2half(x_data); - mask[i] = m; - } - } -} - -template -void launch_dropout(T* out, - const T* input, - const T* residual, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream) -{ - assert(unroll_factor == 4); - - int total_count = batch * dim / unroll_factor; - dim3 grid_dim = DS_GET_BLOCKS(total_count); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - dropout_kernel<<>>( - total_count, dim, ratio, input, residual, bias, out, mask, seed); -} - -template void launch_dropout(float*, - const float*, - const float* residual, - const float* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); -template void launch_dropout(__half*, - const __half*, - const __half* residual, - const __half* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer/dropout_kernels.hip b/deepspeed/ops/csrc/transformer/dropout_kernels.hip deleted file mode 100644 index 
a4b880a721e9833d10bccd0fa438acf6b14ded54..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/dropout_kernels.hip +++ /dev/null @@ -1,870 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -const int unroll_factor = 4; - -__global__ void dropout_kernel(const int N, - const float ratio, - float* out, - const float* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float4 rand = hiprand_uniform4(&state); - uint8_t m[unroll_factor]; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - int i = j * unroll_factor; - - mask[i] = (uint8_t)m[0]; - mask[i + 1] = (uint8_t)m[1]; - mask[i + 2] = (uint8_t)m[2]; - mask[i + 3] = (uint8_t)m[3]; - - out[i] = Xdata[i] * scale * m[0]; - out[i + 1] = Xdata[i + 1] * scale * m[1]; - out[i + 2] = Xdata[i + 2] * scale * m[2]; - out[i + 3] = Xdata[i + 3] * scale * m[3]; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - out[i] = Xdata[i] * scale * m; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const float ratio, - __half* out, - const __half* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - -#ifdef __STOCHASTIC_MODE__ - - const __half2 h_scale = __float2half2_rn(scale); - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_cast = reinterpret_cast(mask); - - uint32_t m_32; - uint8_t* m = reinterpret_cast(&m_32); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - __half2 mask_h[2]; - float2 mask_f[2]; - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_f = x_cast[j]; - __half2* x_h = reinterpret_cast<__half2*>(&x_f); - - float4 rand = hiprand_uniform4(&state); - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); - - mask_h[0] = __float22half2_rn(mask_f[0]); - mask_h[1] = __float22half2_rn(mask_f[1]); - - result_h[0] = x_h[0] * h_scale * mask_h[0]; - result_h[1] = x_h[1] * h_scale * mask_h[1]; - - out_cast[j] = result_f; - - mask_cast[j] = m_32; - } - -#else - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - const __half2* vals_half = reinterpret_cast(Xdata + i); - float2 vals_half_f[2]; - vals_half_f[0] = __half22float2(vals_half[0]); - vals_half_f[1] = __half22float2(vals_half[1]); - - uint8_t m[unroll_factor]; - float4 rand = hiprand_uniform4(&state); - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - out[i] = __float2half(vals_half_f[0].x * scale * m[0]); - out[i + 1] = __float2half(vals_half_f[0].y * scale * m[1]); - out[i + 2] = __float2half(vals_half_f[1].x * scale * m[2]); - out[i + 3] = __float2half(vals_half_f[1].y * scale * m[3]); - - mask[i] = 
m[0]; - mask[i + 1] = m[1]; - mask[i + 2] = m[2]; - mask[i + 3] = m[3]; - } - -#endif - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - out[i] = __float2half((float)Xdata[i] * scale * m); - mask[i] = m; - } - } -} - -__global__ void dropout_kernel_bwd(const int N, - const float ratio, - const float* Xdata, - float* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - out[i] = mask[i] ? Xdata[i] * scale : 0.0; - out[i + 1] = mask[i + 1] ? Xdata[i + 1] * scale : 0.0; - out[i + 2] = mask[i + 2] ? Xdata[i + 2] * scale : 0.0; - out[i + 3] = mask[i + 3] ? Xdata[i + 3] * scale : 0.0; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { out[i] = mask[i] ? Xdata[i] * scale : 0.0; } - } -} - -__global__ void dropout_kernel_bwd(const int N, - const float ratio, - const __half* Xdata, - __half* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - -#ifdef __STOCHASTIC_MODE__ - - const __half2 h_scale = __float2half2_rn(scale); - - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_cast = reinterpret_cast(mask); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_f = x_cast[j]; - __half2* x_h = reinterpret_cast<__half2*>(&x_f); - - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - __half2 mask_h[2]; - float2 mask_f[2]; - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); - -#pragma unroll - for (int i = 0; i < 2; i++) mask_h[i] = __float22half2_rn(mask_f[i]); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = x_h[0] * h_scale * mask_h[0]; - result_h[1] = x_h[1] * h_scale * mask_h[1]; - - out_cast[j] = result_f; - } - -#else - - const __half h_scale = __float2half(scale); - const __half h_zero = __float2half(0.0); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - const __half2* vals_half = reinterpret_cast(Xdata + i); - - uint8_t* m = mask + i; - - float2 vals_half_f[2]; - - vals_half_f[0] = __half22float2(vals_half[0]); - vals_half_f[1] = __half22float2(vals_half[1]); - - out[i] = __float2half(vals_half_f[0].x * scale * m[0]); - out[i + 1] = __float2half(vals_half_f[0].y * scale * m[1]); - out[i + 2] = __float2half(vals_half_f[1].x * scale * m[2]); - out[i + 3] = __float2half(vals_half_f[1].y * scale * m[3]); - } - -#endif - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - out[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout(T* out, - const T* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - hipStream_t stream, - bool bwd) -{ - assert(unroll_factor == 4); - - dim3 
grid_dim = DS_GET_BLOCKS(total_count / unroll_factor); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - if (dim > 512) { - block_dim.x >>= 1; - grid_dim.x <<= 1; - } - uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - if (bwd) - hipLaunchKernelGGL(( dropout_kernel_bwd), dim3(grid_dim), dim3(block_dim), 0, stream, - total_count, ratio, vals, out, mask, seed); - else - hipLaunchKernelGGL(( dropout_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - total_count, ratio, out, vals, mask, seed); -} - -template void launch_dropout(float* out, - const float* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - hipStream_t stream, - bool); -template void launch_dropout(__half* out, - const __half* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - hipStream_t stream, - bool); - -__global__ void dropout_grad_kernel(const int N, const float scale, float* Xdata, uint8_t* mask) -{ - CUDA_1D_KERNEL_LOOP(i, N) { Xdata[i] *= scale * mask[i]; } -} - -__global__ void dropout_grad_kernel(const int N, const float scale, __half* Xdata, uint8_t* mask) -{ - const __half2 h_scale = __float2half2_rn(scale); - float2* x_cast = reinterpret_cast(Xdata); - uint32_t* mask_cast = reinterpret_cast(mask); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_data = x_cast[j]; - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - -#ifdef __STOCHASTIC_MODE__ - - __half2* x_data_h = reinterpret_cast<__half2*>(&x_data); - __half2 mask_h[2]; - float2 mask_f[2]; - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) *(mask_f_data++) = (float)(m[i]); - - mask_h[0] = __float22half2_rn(mask_f[0]); - mask_h[1] = __float22half2_rn(mask_f[1]); - - result_h[0] = x_data_h[0] * h_scale * mask_h[0]; - result_h[1] = x_data_h[1] * h_scale * mask_h[1]; - -#else - - __half* 
x_data_h = reinterpret_cast<__half*>(&x_data); - float2 result[2]; - - result[0].x = (float)x_data_h[0] * scale * m[0]; - result[0].y = (float)x_data_h[1] * scale * m[1]; - result[1].x = (float)x_data_h[2] * scale * m[2]; - result[1].y = (float)x_data_h[3] * scale * m[3]; - - result_h[0] = __float22half2_rn(result[0]); - result_h[1] = __float22half2_rn(result[1]); - -#endif - x_cast[j] = result_f; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - Xdata[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream) -{ - assert(unroll_factor == 4); - - const float scale = 1. / (1. - ratio); - hipLaunchKernelGGL(( dropout_grad_kernel), dim3(DS_GET_BLOCKS(total_count / unroll_factor)), - dim3(DS_CUDA_NUM_THREADS), - 0, - stream, total_count, scale, vals, mask); -} - -template void launch_dropout_grad(float* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream); -template void launch_dropout_grad(__half* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream); - -__global__ void dropout_grad_kernel(const int N, - const float scale, - const float* Xdata, - float* out, - uint8_t* mask) -{ - CUDA_1D_KERNEL_LOOP(i, N) { out[i] = Xdata[i] * scale * mask[i]; } -} - -__global__ void dropout_grad_kernel(const int N, - const float scale, - const __half* Xdata, - __half* out, - uint8_t* mask) -{ - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - const uint32_t* mask_cast = reinterpret_cast(mask); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_data = x_cast[j]; - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - __half* x_data_h = 
reinterpret_cast<__half*>(&x_data); - float2 result[2]; - - result[0].x = (float)x_data_h[0] * scale * m[0]; - result[0].y = (float)x_data_h[1] * scale * m[1]; - result[1].x = (float)x_data_h[2] * scale * m[2]; - result[1].y = (float)x_data_h[3] * scale * m[3]; - - result_h[0] = __float22half2_rn(result[0]); - result_h[1] = __float22half2_rn(result[1]); - - out_cast[j] = result_f; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - out[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout_grad(T* vals_out, - const T* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream) -{ - assert(unroll_factor == 4); - - const float scale = 1. / (1. - ratio); - hipLaunchKernelGGL(( dropout_grad_kernel), dim3(DS_GET_BLOCKS(total_count / unroll_factor)), - dim3(DS_CUDA_NUM_THREADS), - 0, - stream, total_count, scale, vals, vals_out, mask); -} -template void launch_dropout_grad(float*, - const float* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream); -template void launch_dropout_grad(__half*, - const __half* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream); - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const float* bias, - float* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - float4* Xdata_cast = reinterpret_cast(Xdata); - uint32_t* mask_32 = reinterpret_cast(mask); - const float4* bias_cast = reinterpret_cast(bias); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = hiprand_uniform4(&state); - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float4 x_data = Xdata_cast[j]; - float4 b_data = bias_cast[j % (dim / unroll_factor)]; - - x_data.x += b_data.x; - x_data.y += b_data.y; - x_data.z += b_data.z; - x_data.w += b_data.w; - - x_data.x = x_data.x * scale * m[0]; - x_data.y = x_data.y * scale * m[1]; - x_data.z = x_data.z * scale * m[2]; - x_data.w = x_data.w * scale * m[3]; - - mask_32[j] = m_32; - Xdata_cast[j] = x_data; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = Xdata[i] + bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - Xdata[i] = x_data * scale * m; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const __half* bias, - __half* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - float2* Xdata_cast = reinterpret_cast(Xdata); - uint32_t* mask_32 = reinterpret_cast(mask); - const float2* bias_cast = reinterpret_cast(bias); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = hiprand_uniform4(&state); - - float2 data_f; - __half2* data_h = reinterpret_cast<__half2*>(&data_f); - - float2 bias_f; - __half2* bias_h = reinterpret_cast<__half2*>(&bias_f); - - data_f = Xdata_cast[j]; - bias_f = bias_cast[j % (dim / unroll_factor)]; - - float2 data_h_0 = __half22float2(data_h[0]); - float2 data_h_1 = __half22float2(data_h[1]); - - float2 bias_h_0 = __half22float2(bias_h[0]); - float2 bias_h_1 = __half22float2(bias_h[1]); - - data_h_0.x += bias_h_0.x; - data_h_0.y += bias_h_0.y; - data_h_1.x += bias_h_1.x; - data_h_1.y += bias_h_1.y; - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - data_h_0.x = __float2half(data_h_0.x * scale * m[0]); - data_h_0.y = __float2half(data_h_0.y * scale * m[1]); - data_h_1.x = __float2half(data_h_1.x * scale * m[2]); - data_h_1.y = __float2half(data_h_1.y * scale * m[3]); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = __float22half2_rn(data_h_0); - result_h[1] = __float22half2_rn(data_h_1); - - Xdata_cast[j] = result_f; - mask_32[j] = m_32; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = (float)Xdata[i] + (float)bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - Xdata[i] = 
__float2half(x_data * scale * m); - mask[i] = m; - } - } -} - -template -void launch_dropout(T* out, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream) -{ - assert(unroll_factor == 4); - - int total_count = batch * dim / unroll_factor; - - dim3 grid_dim = DS_GET_BLOCKS(total_count); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - hipLaunchKernelGGL(( dropout_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - total_count, dim, ratio, bias, out, mask, seed); -} - -template void launch_dropout(float*, - const float* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); -template void launch_dropout(__half*, - const __half* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const float* input, - const float* residual, - const float* bias, - float* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - float4* out_cast = reinterpret_cast(out); - uint32_t* mask_32 = reinterpret_cast(mask); - - const float4* bias_cast = reinterpret_cast(bias); - const float4* residual_cast = reinterpret_cast(residual); - const float4* input_cast = reinterpret_cast(input); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = hiprand_uniform4(&state); - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float4 out_data; - float4 b_data = bias_cast[j % (dim / unroll_factor)]; - float4 res_data = residual_cast[j]; - float4 inp_data = input_cast[j]; - - out_data.x = (b_data.x + inp_data.x); - out_data.y = (b_data.y + inp_data.y); - out_data.z = (b_data.z + inp_data.z); - out_data.w = (b_data.w + inp_data.w); - - out_data.x = out_data.x * scale * m[0]; - out_data.y = out_data.y * scale * m[1]; - out_data.z = out_data.z * scale * m[2]; - out_data.w = out_data.w * scale * m[3]; - - out_data.x += res_data.x; - out_data.y += res_data.y; - out_data.z += res_data.z; - out_data.w += res_data.w; - - mask_32[j] = m_32; - out_cast[j] = out_data; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = input[i] + bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - x_data = x_data * scale * m; - x_data += residual[i]; - - out[i] = x_data; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const __half* input, - const __half* residual, - const __half* 
bias, - __half* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_32 = reinterpret_cast(mask); - - const float2* bias_cast = reinterpret_cast(bias); - const float2* residual_cast = reinterpret_cast(residual); - const float2* input_cast = reinterpret_cast(input); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = hiprand_uniform4(&state); - - float2 data_f; - __half2* data_h = reinterpret_cast<__half2*>(&data_f); - - float2 bias_f; - __half2* bias_h = reinterpret_cast<__half2*>(&bias_f); - - float2 residual_f; - __half2* residual_h = reinterpret_cast<__half2*>(&residual_f); - - float2 input_f; - __half2* input_h = reinterpret_cast<__half2*>(&input_f); - - bias_f = bias_cast[j % (dim / unroll_factor)]; - residual_f = residual_cast[j]; - input_f = input_cast[j]; - - float2 data_h_0 = __half22float2(data_h[0]); - float2 data_h_1 = __half22float2(data_h[1]); - - float2 bias_h_0 = __half22float2(bias_h[0]); - float2 bias_h_1 = __half22float2(bias_h[1]); - - float2 residual_h_0 = __half22float2(residual_h[0]); - float2 residual_h_1 = __half22float2(residual_h[1]); - - float2 input_h_0 = __half22float2(input_h[0]); - float2 input_h_1 = __half22float2(input_h[1]); - - data_h_0.x = (bias_h_0.x + input_h_0.x); - data_h_0.y = (bias_h_0.y + input_h_0.y); - data_h_1.x = (bias_h_1.x + input_h_1.x); - data_h_1.y = (bias_h_1.y + input_h_1.y); - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - data_h_0.x = __float2half(data_h_0.x * scale * m[0]); - data_h_0.y = __float2half(data_h_0.y * scale * m[1]); - data_h_1.x = __float2half(data_h_1.x * scale * 
m[2]); - data_h_1.y = __float2half(data_h_1.y * scale * m[3]); - - data_h_0.x += residual_h_0.x; - data_h_0.y += residual_h_0.y; - data_h_1.x += residual_h_1.x; - data_h_1.y += residual_h_1.y; - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = __float22half2_rn(data_h_0); - result_h[1] = __float22half2_rn(data_h_1); - - out_cast[j] = result_f; - mask_32[j] = m_32; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = (float)input[i] + (float)bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - x_data = x_data * scale * m; - x_data += (float)residual[i]; - - out[i] = __float2half(x_data); - mask[i] = m; - } - } -} - -template -void launch_dropout(T* out, - const T* input, - const T* residual, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream) -{ - assert(unroll_factor == 4); - - int total_count = batch * dim / unroll_factor; - dim3 grid_dim = DS_GET_BLOCKS(total_count); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - hipLaunchKernelGGL(( dropout_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - total_count, dim, ratio, input, residual, bias, out, mask, seed); -} - -template void launch_dropout(float*, - const float*, - const float* residual, - const float* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); -template void launch_dropout(__half*, - const __half*, - const __half* residual, - const __half* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); diff --git a/deepspeed/ops/csrc/transformer/ds_transformer_cuda.cpp 
b/deepspeed/ops/csrc/transformer/ds_transformer_cuda.cpp deleted file mode 100644 index 0e86322291f76573730b56fb25fb5e89f3d09ddd..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/ds_transformer_cuda.cpp +++ /dev/null @@ -1,1051 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include "Timer.h" -#include "context.h" -#include "cublas_wrappers.h" -#include "custom_cuda_layers.h" -#include "ds_transformer_cuda.h" - -static std::unordered_map> s_transformer_layers; - -const int init_seq_length = 128; - -// C++ interface - -template -unsigned get_workspace_size(unsigned maxBatchSize, - unsigned seq_len, - unsigned hidden_size, - unsigned intermediate_size, - unsigned heads, - bool training, - bool gelu_checkpoint) -{ - unsigned workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size); - if (training) { - workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size); - workSpacesize += ((std::max)((size_t(maxBatchSize) * seq_len * intermediate_size), - 2 * (size_t(maxBatchSize) * heads * seq_len * seq_len))); - if (gelu_checkpoint) - workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * intermediate_size); - } - return workSpacesize; // * sizeof(T); -} - -// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
-#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -template -BertTransformerLayer::BertTransformerLayer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_size, - unsigned num_heads, - unsigned intermediate_size, - unsigned seq_length, - float attn_prob_dropout_ratio, - float hidden_output_dropout_ratio, - float layer_norm_eps, - bool pre_or_postLayerNorm, - const std::vector>& gemm_algos, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode) - : _layer_id(layer_id), - _batch_size(batch_size), - _hidden_size(hidden_size), - _heads(num_heads), - _intermediate_size(intermediate_size), - _seq_length(seq_length), - _training(true), - _pre_or_postLayerNorm(pre_or_postLayerNorm), - _attn_dropout_checkpoint(attn_dropout_checkpoint), - _normalize_invertible(normalize_invertible), - _gelu_checkpoint(gelu_checkpoint), - _stochastic_mode(stochastic_mode), - _stream(Context::Instance().GetCurrentStream()), - _cublasHandle(Context::Instance().GetCublasHandle()), - _qkv_linear(typename FeedForward::Config(batch_size * seq_length, - 3 * hidden_size, - hidden_size, - gemm_algos[0])), - _attn_out_linear(typename FeedForward::Config(batch_size * seq_length, - hidden_size, - hidden_size, - gemm_algos[0])), - _attn_layer_norm(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - layer_norm_eps, - true, - !normalize_invertible)), - _layer_norm(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - layer_norm_eps, - true, - !normalize_invertible)), - _ff1(typename FeedForward::Config(batch_size * seq_length, - _intermediate_size, - hidden_size, - gemm_algos[1])), - _ff2(typename FeedForward::Config(batch_size * seq_length, - hidden_size, - _intermediate_size, - gemm_algos[2])), - _softmax(typename 
Softmax::Config(batch_size, num_heads, seq_length)), - _gelu(typename Gelu::Config(_intermediate_size)), - _attn_prob_dropout(typename Dropout::Config(attn_prob_dropout_ratio, _seq_length)), - _attn_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), - _layer_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), - _attn_scores(typename StridedBatchGemm::Config(_batch_size * _heads, - _seq_length, - _seq_length, - _hidden_size / _heads, - //(T(1.0) / T(sqrt(_hidden_size / _heads))), - //aiss debug 0506 - (T(1.0 / (sqrt(_hidden_size / _heads)))), - T(0.0), - CUBLAS_OP_T, - CUBLAS_OP_N, - gemm_algos[3])), - _attn_context(typename StridedBatchGemm::Config(_batch_size * _heads, - _hidden_size / _heads, - _seq_length, - _seq_length, - T(1.0), - T(0.0), - CUBLAS_OP_N, - CUBLAS_OP_N, - gemm_algos[4])) -{ - assert(_hidden_size % _heads == 0); - - Initialize(); -} - -template -BertTransformerLayer::~BertTransformerLayer() -{ -} - -template -void BertTransformerLayer::Initialize() -{ -#ifndef __HIP_PLATFORM_HCC__ - if (std::is_same::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); -#endif -} - -template -void BertTransformerLayer::Forward(unsigned bsz, - const T* input_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_qkvb_ptr, - const T* attn_ow_ptr, - const T* attn_ob_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* output_b_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - T* out_ptr, - T* inp_norm_ptr, - T* q_tf_ptr, - T* k_tf_ptr, - T* v_tf_ptr, - T* soft_out_ptr, - T* ctx_bufB_ptr, - T* attn_o_inp_ptr, - T* add_res_ptr, - T* ff1_inp_ptr, - T* gelu_inp_ptr, - T* ff2_inp_ptr) -{ - cublasSetStream(_cublasHandle, _stream); - - if (!_stochastic_mode) cudaStreamSynchronize(_stream); - - T* workspace = static_cast(Context::Instance().GetWorkSpace()); - size_t small_buf_size = bsz * 
_seq_length * _hidden_size; - T* buf_0 = workspace; - T* buf_1 = buf_0 + small_buf_size; - T* buf_2 = buf_1; - - if (_normalize_invertible) { - add_res_ptr = buf_1 + 3 * small_buf_size; - buf_2 = add_res_ptr; - } - if (_gelu_checkpoint) buf_2 += small_buf_size; - if (_attn_dropout_checkpoint) - ctx_bufB_ptr = - (_gelu_checkpoint ? (buf_2 + (_intermediate_size / _hidden_size) * small_buf_size) - : (buf_1 + 4 * small_buf_size)); - - int bsz_seq = bsz * _seq_length; - - if (_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.ForwardCheckpoint( - bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - - else - _layer_norm.Forward( - bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - } - - if (_pre_or_postLayerNorm) - _qkv_linear.Forward(bsz_seq, inp_norm_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); - else - _qkv_linear.Forward(bsz_seq, input_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); - - launch_bias_add_transform_0213( - q_tf_ptr, buf_0, attn_qkvb_ptr, bsz, _seq_length, _hidden_size, _heads, _stream, 3); - - int bsz_heads = bsz * _heads; - - // attention scores - _attn_scores.Forward(bsz_heads, soft_out_ptr, k_tf_ptr, q_tf_ptr, _cublasHandle); - - // Softmax + Mask - _softmax.Forward(bsz, soft_out_ptr, input_mask_ptr, _stream); - - // attn prob dropout. - _attn_prob_dropout.Forward(bsz_heads * _seq_length, ctx_bufB_ptr, soft_out_ptr, _stream); - - // attention context - _attn_context.Forward(bsz_heads, buf_1, v_tf_ptr, ctx_bufB_ptr, _cublasHandle); - - launch_transform4d_0213( - attn_o_inp_ptr, buf_1, bsz, _heads, _seq_length, _hidden_size, _stream, 1); - - if (_pre_or_postLayerNorm) - _attn_out_linear.Forward(bsz_seq, attn_o_inp_ptr, attn_ow_ptr, buf_1, _cublasHandle); - else - _attn_out_linear.Forward(bsz_seq, attn_o_inp_ptr, attn_ow_ptr, ff1_inp_ptr, _cublasHandle); - - // attn output dropout. 
- if (_pre_or_postLayerNorm) - _attn_output_dropout.ForwardWithBias( - bsz_seq, add_res_ptr, buf_1, input_ptr, attn_ob_ptr, _stream); - else - _attn_output_dropout.ForwardWithBias( - bsz_seq, add_res_ptr, ff1_inp_ptr, input_ptr, attn_ob_ptr, _stream); - - if (_pre_or_postLayerNorm) { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.ForwardCheckpoint( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - else - _attn_layer_norm.Forward( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - } else { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.ForwardCheckpoint( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - else - _attn_layer_norm.Forward( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - } - - _ff1.Forward(bsz_seq, - ff1_inp_ptr, - inter_w_ptr, - (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), - _cublasHandle); - - _gelu.ForwardWithBiasAdd(bsz_seq, - (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), - inter_b_ptr, - (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), - _stream); - - _ff2.Forward( - bsz_seq, (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), output_w_ptr, out_ptr, _cublasHandle); - - // layer output dropout. 
- if (_pre_or_postLayerNorm) - _layer_output_dropout.ForwardWithBias( - bsz_seq, out_ptr, out_ptr, add_res_ptr, output_b_ptr, _stream); - else - _layer_output_dropout.ForwardWithBias( - bsz_seq, inp_norm_ptr, out_ptr, ff1_inp_ptr, output_b_ptr, _stream); - - if (!_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.ForwardCheckpoint( - bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - else - _layer_norm.Forward( - bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - } -} - -template -void BertTransformerLayer::Backward(unsigned bsz, - const T* grad_output_ptr, - const T* input_ptr, - const T* output_ptr, - const T* inp_norm_ptr, - const T* q_tf_ptr, - const T* k_tf_ptr, - const T* v_tf_ptr, - const T* soft_out_ptr, - const T* ctx_bufB_ptr, - const T* attn_o_inp_ptr, - const T* add_res_ptr, - const T* ff1_inp_ptr, - const T* gelu_inp_ptr, - const T* ff2_inp_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_ow_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - - T* grad_input_ptr, - T* grad_attn_qkvw_ptr, - T* grad_attn_qkvb_ptr, - T* grad_attn_ow_ptr, - T* grad_attn_ob_ptr, - T* grad_attn_nw_ptr, - T* grad_attn_nb_ptr, - T* grad_inter_w_ptr, - T* grad_inter_b_ptr, - T* grad_output_w_ptr, - T* grad_output_b_ptr, - T* grad_norm_w_ptr, - T* grad_norm_b_ptr) -{ - cublasSetStream(_cublasHandle, _stream); - - if (!_stochastic_mode) cudaStreamSynchronize(_stream); - - T* workspace = static_cast(Context::Instance().GetWorkSpace()); - size_t small_buf_size = bsz * _seq_length * _hidden_size; - T* buf_0 = workspace; - T* buf_1 = buf_0 + small_buf_size; - T* buf_2 = buf_1 + small_buf_size; - T* buf_3 = buf_2 + small_buf_size; - - T* ff2_buf = (_gelu_checkpoint ? 
buf_3 + (bsz * _seq_length * _intermediate_size) - : buf_3 + small_buf_size); - T* ctx_bufB_ptr_recomp = ff2_buf + (_seq_length * _seq_length * bsz * _heads); - - cudaStream_t streams[2] = {_stream, _stream}; - - int bsz_seq = bsz * _seq_length; - int bsz_heads = bsz * _heads; - - if (!_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - inp_norm_ptr); - - else - _layer_norm.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - output_ptr); - } - - if (_pre_or_postLayerNorm) - _layer_output_dropout.Backward(bsz_seq, buf_0, grad_output_ptr, _stream); - else - _layer_output_dropout.Backward(bsz_seq, buf_0, buf_1, _stream); - - const T* layer_dropout_buf = _layer_output_dropout.HasDropout() - ? buf_0 - : (_pre_or_postLayerNorm ? grad_output_ptr : buf_1); - - if (_gelu_checkpoint) - _gelu.ForwardWithBiasAdd(bsz_seq, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); - _ff2.Backward(bsz_seq, - layer_dropout_buf, - (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), - output_w_ptr, - grad_output_w_ptr, - grad_output_b_ptr, - _cublasHandle, - _stream, - ff2_buf); - - _gelu.Backward( - bsz_seq, ff2_buf, (_gelu_checkpoint ? 
ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, _stream); - - _ff1.Backward(bsz_seq, - ff2_buf, - ff1_inp_ptr, - inter_w_ptr, - grad_inter_w_ptr, - grad_inter_b_ptr, - _cublasHandle, - _stream, - buf_3); - - if (!_pre_or_postLayerNorm) - launch_fused_add2(buf_2, buf_3, buf_1, bsz, _seq_length, _hidden_size, _stream); - - if (_pre_or_postLayerNorm) { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); - - else - _attn_layer_norm.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); - } else { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); - - else - _attn_layer_norm.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); - } - - _attn_output_dropout.Backward(bsz_seq, buf_2, buf_0, _stream); - - T* attn_output_dropout_buf = _attn_output_dropout.HasDropout() ? buf_2 : buf_0; - - _attn_out_linear.Backward(bsz_seq, - attn_output_dropout_buf, - attn_o_inp_ptr, - attn_ow_ptr, - grad_attn_ow_ptr, - grad_attn_ob_ptr, - _cublasHandle, - _stream, - buf_1); - - launch_transform_0213(buf_2, buf_1, bsz, _seq_length, _hidden_size, _heads, _stream); - - if (_attn_prob_dropout.HasDropout()) { - if (_attn_dropout_checkpoint) - _attn_prob_dropout.Forward( - bsz_heads * _seq_length, ctx_bufB_ptr_recomp, soft_out_ptr, _stream, true); - - _attn_context.Backward(bsz_heads, - buf_2, - v_tf_ptr, - (_attn_dropout_checkpoint ? 
ctx_bufB_ptr_recomp : ctx_bufB_ptr), - _cublasHandle, - buf_3, - ff2_buf); - } else - _attn_context.Backward( - bsz_heads, buf_2, v_tf_ptr, soft_out_ptr, _cublasHandle, buf_3, ff2_buf); - - _attn_prob_dropout.Backward(bsz_heads * _seq_length, ff2_buf, _stream); - - _softmax.Backward(bsz, ff2_buf, soft_out_ptr, _stream); - - _attn_scores.Backward(bsz_heads, ff2_buf, k_tf_ptr, q_tf_ptr, _cublasHandle, buf_2, buf_1); - - launch_transform4d_0213(ff2_buf, buf_1, bsz, _heads, _seq_length, _hidden_size, _stream, 3); - - if (_pre_or_postLayerNorm) - _qkv_linear.Backward(bsz_seq, - ff2_buf, - inp_norm_ptr, - attn_qkvw_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - _cublasHandle, - _stream, - buf_2); - else - _qkv_linear.Backward(bsz_seq, - ff2_buf, - input_ptr, - attn_qkvw_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - _cublasHandle, - _stream, - buf_2); - - if (_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - input_ptr); - - else - _layer_norm.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - inp_norm_ptr); - } else - launch_fused_add2(grad_input_ptr, buf_2, buf_0, bsz, _seq_length, _hidden_size, _stream); -} - -template -void BertTransformerLayer::SetTrainingMode(bool training) -{ - // Dropout will be skipped when not in training model. 
- _attn_prob_dropout.SetTrainingMode(training); - _attn_output_dropout.SetTrainingMode(training); - _layer_output_dropout.SetTrainingMode(training); -} - -template -void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, - uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr, - T* attn_layer_norm_var, - T* attn_layer_norm_mean, - T* layer_norm_var, - T* layer_norm_mean) -{ - _attn_prob_dropout.SetMask(attn_prob_dropout_mask_ptr); - _attn_output_dropout.SetMask(attn_output_dropout_mask_ptr); - _layer_output_dropout.SetMask(layer_output_dropout_mask_ptr); - - _attn_layer_norm.SetVar(attn_layer_norm_var); - _attn_layer_norm.SetMean(attn_layer_norm_mean); - _layer_norm.SetVar(layer_norm_var); - _layer_norm.SetMean(layer_norm_mean); -} - -template -void BertTransformerLayer::SetSeqLength(unsigned seq_len) -{ - _seq_length = seq_len; - - _softmax.SetSeqLength(_seq_length); - _attn_prob_dropout.SetDimension(_seq_length); - _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); - _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); -} - -template -int create_transformer_layer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_dim, - unsigned num_heads, - unsigned intermediate_size, - float attn_dropout_ratio, - float hidden_dropout_ratio, - float layer_norm_eps, - int seed, - bool pre_or_postLayerNorm, - bool test_gemm, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode) -{ - Context::Instance().SetSeed(seed); - Context::Instance().TestGemmFP16( - test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads); - - auto layer = std::make_shared>(layer_id, - batch_size, - hidden_dim, - num_heads, - intermediate_size, - init_seq_length, - attn_dropout_ratio, - hidden_dropout_ratio, - layer_norm_eps, - pre_or_postLayerNorm, - Context::Instance().GetGemmAlgos(), - attn_dropout_checkpoint, - 
normalize_invertible, - gelu_checkpoint, - stochastic_mode); - - s_transformer_layers[layer_id] = layer; - - std::string dtype = (std::is_same::value) ? "half" : "float"; - - std::cout << "layer #" << layer_id << " is created with date type [" << dtype << "]." - << std::endl; - - return 0; -} - -template -std::vector ds_transformer_forward(unsigned layer_id, - const torch::Tensor& input, - const torch::Tensor& input_mask, - const torch::Tensor& attn_qkvw, - const torch::Tensor& attn_qkvb, - const torch::Tensor& attn_ow, - const torch::Tensor& attn_ob, - const torch::Tensor& attn_nw, - const torch::Tensor& attn_nb, - const torch::Tensor& inter_w, - const torch::Tensor& inter_b, - const torch::Tensor& output_w, - const torch::Tensor& output_b, - const torch::Tensor& norm_w, - const torch::Tensor& norm_b, - bool training_mode, - bool prelayernorm, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint) -{ - CHECK_INPUT(input); - CHECK_INPUT(input_mask); - CHECK_INPUT(attn_qkvw); - CHECK_INPUT(attn_qkvb); - CHECK_INPUT(attn_ow); - CHECK_INPUT(attn_ob); - CHECK_INPUT(attn_nw); - CHECK_INPUT(attn_nb); - CHECK_INPUT(inter_w); - CHECK_INPUT(inter_b); - CHECK_INPUT(output_w); - CHECK_INPUT(output_b); - CHECK_INPUT(norm_w); - CHECK_INPUT(norm_b); - - unsigned bsz = input.size(0); - - const T* input_ptr = (const T*)input.data_ptr(); - const T* input_mask_ptr = (const T*)input_mask.data_ptr(); - const T* attn_qkvw_ptr = (const T*)attn_qkvw.data_ptr(); - const T* attn_qkvb_ptr = (const T*)attn_qkvb.data_ptr(); - const T* attn_ow_ptr = (const T*)attn_ow.data_ptr(); - const T* attn_ob_ptr = (const T*)attn_ob.data_ptr(); - const T* attn_nw_ptr = (const T*)attn_nw.data_ptr(); - const T* attn_nb_ptr = (const T*)attn_nb.data_ptr(); - const T* inter_w_ptr = (const T*)inter_w.data_ptr(); - const T* inter_b_ptr = (const T*)inter_b.data_ptr(); - const T* output_w_ptr = (const T*)output_w.data_ptr(); - const T* output_b_ptr = (const T*)output_b.data_ptr(); - 
const T* norm_w_ptr = (const T*)norm_w.data_ptr(); - const T* norm_b_ptr = (const T*)norm_b.data_ptr(); - - auto output = torch::empty_like(input); - T* out_ptr = (T*)output.data_ptr(); - - auto options = torch::TensorOptions() - .dtype(input.options().dtype()) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(true); - - auto uint8_options = torch::TensorOptions() - .dtype(torch::kInt8) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(false); - - std::shared_ptr> layer = - std::static_pointer_cast>(s_transformer_layers[layer_id]); - - unsigned seq_len = layer->GetSeqLength(); - if (input.size(1) != seq_len) { - seq_len = input.size(1); - layer->SetSeqLength(seq_len); - } - - auto workspace = torch::empty({get_workspace_size(bsz, - seq_len, - layer->GetHiddenSize(), - layer->GetIntermediateSize(), - layer->GetNumHeads(), - layer->IsTrainingMode(), - layer->GeluCheckpoint())}, - options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); - - auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); - auto add_res = (normalize_invertible ? 
inp_norm : torch::empty_like(input)); - auto attn_o_inp = torch::empty_like(input); - auto qkv_tf = torch::empty({(bsz * seq_len), output_w.size(0) * 3}, options); - - auto attn_prob_dropout_mask = - torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, uint8_options); - auto attn_output_dropout_mask = - torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - auto layer_output_dropout_mask = - torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - - auto attn_layer_norm_var = torch::empty({(bsz * seq_len)}, options); - auto attn_layer_norm_mean = torch::empty({(bsz * seq_len)}, options); - auto layer_norm_var = torch::empty({(bsz * seq_len)}, options); - auto layer_norm_mean = torch::empty({(bsz * seq_len)}, options); - - T* inp_norm_ptr = (T*)inp_norm.data_ptr(); - T* add_res_ptr = (T*)add_res.data_ptr(); - T* q_tf_ptr = (T*)qkv_tf.data_ptr(); - T* k_tf_ptr = q_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)k_tf.data_ptr(); - T* v_tf_ptr = k_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)v_tf.data_ptr(); - T* attn_o_inp_ptr = (T*)attn_o_inp.data_ptr(); - - torch::Tensor ff2_inp = torch::empty({(bsz * seq_len), output_w.size(1)}, options); - torch::Tensor gelu_inp = - (gelu_checkpoint ? ff2_inp : torch::empty({(bsz * seq_len), output_w.size(1)}, options)); - auto ff1_inp = torch::empty_like(input); - T* ff2_inp_ptr = (T*)ff2_inp.data_ptr(); - T* gelu_inp_ptr = (T*)gelu_inp.data_ptr(); - T* ff1_inp_ptr = (T*)ff1_inp.data_ptr(); - - torch::Tensor soft_out = - torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options); - torch::Tensor ctx_bufB = - (attn_dropout_checkpoint - ? 
soft_out - : torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options)); - T* soft_out_ptr = (T*)soft_out.data_ptr(); - T* ctx_bufB_ptr = (T*)ctx_bufB.data_ptr(); - - layer->SetTrainingMode(training_mode); - layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), - (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)attn_layer_norm_var.data_ptr(), - (T*)attn_layer_norm_mean.data_ptr(), - (T*)layer_norm_var.data_ptr(), - (T*)layer_norm_mean.data_ptr()); - - layer->Forward(bsz, - input_ptr, - input_mask_ptr, - attn_qkvw_ptr, - attn_qkvb_ptr, - attn_ow_ptr, - attn_ob_ptr, - attn_nw_ptr, - attn_nb_ptr, - inter_w_ptr, - inter_b_ptr, - output_w_ptr, - output_b_ptr, - norm_w_ptr, - norm_b_ptr, - out_ptr, - inp_norm_ptr, - q_tf_ptr, - k_tf_ptr, - v_tf_ptr, - soft_out_ptr, - ctx_bufB_ptr, - attn_o_inp_ptr, - add_res_ptr, - ff1_inp_ptr, - gelu_inp_ptr, - ff2_inp_ptr); - - return {output, - inp_norm, - qkv_tf, - soft_out, - ctx_bufB, - attn_o_inp, - add_res, - ff1_inp, - gelu_inp, - ff2_inp, - attn_prob_dropout_mask, - attn_output_dropout_mask, - layer_output_dropout_mask, - attn_layer_norm_var, - attn_layer_norm_mean, - layer_norm_var, - layer_norm_mean}; -} - -template -std::vector ds_transformer_backward(unsigned layer_id, - const torch::Tensor& grad_output, - const torch::Tensor& output, - const torch::Tensor& inp_norm, - const torch::Tensor& qkv_tf, - const torch::Tensor& soft_out, - const torch::Tensor& ctx_bufB, - const torch::Tensor& attn_o_inp, - const torch::Tensor& add_res, - const torch::Tensor& ff1_inp, - const torch::Tensor& gelu_inp, - const torch::Tensor& ff2_inp, - const torch::Tensor& attn_prob_dropout_mask, - const torch::Tensor& attn_output_dropout_mask, - const torch::Tensor& layer_output_dropout_mask, - const torch::Tensor& attn_layer_norm_var, - const torch::Tensor& attn_layer_norm_mean, - const torch::Tensor& layer_norm_var, - const torch::Tensor& layer_norm_mean, - 
const torch::Tensor& input, - const torch::Tensor& input_mask, - const torch::Tensor& attn_qkvw, - const torch::Tensor& attn_qkvb, - const torch::Tensor& attn_ow, - const torch::Tensor& attn_ob, - const torch::Tensor& attn_nw, - const torch::Tensor& attn_nb, - const torch::Tensor& inter_w, - const torch::Tensor& inter_b, - const torch::Tensor& output_w, - const torch::Tensor& output_b, - const torch::Tensor& norm_w, - const torch::Tensor& norm_b) -{ - auto g_output = grad_output.contiguous(); - CHECK_INPUT(g_output); - CHECK_INPUT(output); - CHECK_INPUT(inp_norm); - CHECK_INPUT(qkv_tf); - CHECK_INPUT(add_res); - CHECK_INPUT(soft_out); - CHECK_INPUT(ctx_bufB); - CHECK_INPUT(attn_o_inp); - CHECK_INPUT(ff1_inp); - CHECK_INPUT(gelu_inp); - CHECK_INPUT(ff2_inp); - CHECK_INPUT(input); - CHECK_INPUT(input_mask); - CHECK_INPUT(attn_qkvw); - CHECK_INPUT(attn_qkvb); - CHECK_INPUT(attn_ow); - CHECK_INPUT(attn_ob); - CHECK_INPUT(attn_nw); - CHECK_INPUT(attn_nb); - CHECK_INPUT(inter_w); - CHECK_INPUT(inter_b); - CHECK_INPUT(output_w); - CHECK_INPUT(output_b); - CHECK_INPUT(norm_w); - CHECK_INPUT(norm_b); - - unsigned bsz = g_output.size(0); - - std::shared_ptr> layer = - std::static_pointer_cast>(s_transformer_layers[layer_id]); - - unsigned seq_len = layer->GetSeqLength(); - if (g_output.size(1) != seq_len) { - seq_len = g_output.size(1); - layer->SetSeqLength(seq_len); - } - auto options = torch::TensorOptions() - .dtype(g_output.options().dtype()) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(true); - auto workspace = torch::empty({get_workspace_size(bsz, - seq_len, - layer->GetHiddenSize(), - layer->GetIntermediateSize(), - layer->GetNumHeads(), - layer->IsTrainingMode(), - layer->GeluCheckpoint())}, - options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); - - auto grad_input = torch::empty_like(input); - auto grad_attn_qkvw = torch::empty_like(attn_qkvw); - auto grad_attn_qkvb = torch::empty_like(attn_qkvb); - auto grad_attn_ow = 
torch::empty_like(attn_ow); - auto grad_attn_ob = torch::empty_like(attn_ob); - auto grad_attn_nw = torch::empty_like(attn_nw); - auto grad_attn_nb = torch::empty_like(attn_nb); - auto grad_inter_w = torch::empty_like(inter_w); - auto grad_inter_b = torch::empty_like(inter_b); - auto grad_output_w = torch::empty_like(output_w); - auto grad_output_b = torch::empty_like(output_b); - auto grad_norm_w = torch::empty_like(norm_w); - auto grad_norm_b = torch::empty_like(norm_b); - - // inputs. - const T* grad_output_ptr = (const T*)g_output.data_ptr(); - const T* input_ptr = (const T*)input.data_ptr(); - const T* output_ptr = (const T*)output.data_ptr(); - const T* inp_norm_ptr = (const T*)inp_norm.data_ptr(); - const T* q_tf_ptr = (const T*)qkv_tf.data_ptr(); - const T* add_res_ptr = (const T*)add_res.data_ptr(); - const T* k_tf_ptr = - q_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(const T*)k_tf.data_ptr(); - const T* v_tf_ptr = - k_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(const T*)v_tf.data_ptr(); - const T* ff1_inp_ptr = (const T*)ff1_inp.data_ptr(); - const T* gelu_inp_ptr = (const T*)gelu_inp.data_ptr(); - const T* ff2_inp_ptr = (const T*)ff2_inp.data_ptr(); - const T* ctx_bufB_ptr = (const T*)ctx_bufB.data_ptr(); - const T* soft_out_ptr = (const T*)soft_out.data_ptr(); - const T* attn_o_inp_ptr = (const T*)attn_o_inp.data_ptr(); - const T* input_mask_ptr = (const T*)input_mask.data_ptr(); - const T* attn_qkvw_ptr = (const T*)attn_qkvw.data_ptr(); - const T* attn_ow_ptr = (const T*)attn_ow.data_ptr(); - const T* attn_nw_ptr = (const T*)attn_nw.data_ptr(); - const T* attn_nb_ptr = (const T*)attn_nb.data_ptr(); - const T* inter_w_ptr = (const T*)inter_w.data_ptr(); - const T* inter_b_ptr = (const T*)inter_b.data_ptr(); - const T* output_w_ptr = (const T*)output_w.data_ptr(); - const T* norm_w_ptr = (const T*)norm_w.data_ptr(); - const T* norm_b_ptr = (const T*)norm_b.data_ptr(); - - // outputs. 
- T* grad_input_ptr = (T*)grad_input.data_ptr(); - T* grad_attn_qkvw_ptr = (T*)grad_attn_qkvw.data_ptr(); - T* grad_attn_qkvb_ptr = (T*)grad_attn_qkvb.data_ptr(); - T* grad_attn_ow_ptr = (T*)grad_attn_ow.data_ptr(); - T* grad_attn_ob_ptr = (T*)grad_attn_ob.data_ptr(); - T* grad_attn_nw_ptr = (T*)grad_attn_nw.data_ptr(); - T* grad_attn_nb_ptr = (T*)grad_attn_nb.data_ptr(); - T* grad_inter_w_ptr = (T*)grad_inter_w.data_ptr(); - T* grad_inter_b_ptr = (T*)grad_inter_b.data_ptr(); - T* grad_output_w_ptr = (T*)grad_output_w.data_ptr(); - T* grad_output_b_ptr = (T*)grad_output_b.data_ptr(); - T* grad_norm_w_ptr = (T*)grad_norm_w.data_ptr(); - T* grad_norm_b_ptr = (T*)grad_norm_b.data_ptr(); - - layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), - (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)attn_layer_norm_var.data_ptr(), - (T*)attn_layer_norm_mean.data_ptr(), - (T*)layer_norm_var.data_ptr(), - (T*)layer_norm_mean.data_ptr()); - - layer->Backward(bsz, - grad_output_ptr, - input_ptr, - output_ptr, - inp_norm_ptr, - q_tf_ptr, - k_tf_ptr, - v_tf_ptr, - soft_out_ptr, - ctx_bufB_ptr, - attn_o_inp_ptr, - add_res_ptr, - ff1_inp_ptr, - gelu_inp_ptr, - ff2_inp_ptr, - input_mask_ptr, - attn_qkvw_ptr, - attn_ow_ptr, - attn_nw_ptr, - attn_nb_ptr, - inter_w_ptr, - inter_b_ptr, - output_w_ptr, - norm_w_ptr, - norm_b_ptr, - - grad_input_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - grad_attn_ow_ptr, - grad_attn_ob_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - grad_inter_w_ptr, - grad_inter_b_ptr, - grad_output_w_ptr, - grad_output_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr); - - return {grad_input, - grad_attn_qkvw, - grad_attn_qkvb, - grad_attn_ow, - grad_attn_ob, - grad_attn_nw, - grad_attn_nb, - grad_inter_w, - grad_inter_b, - grad_output_w, - grad_output_b, - grad_norm_w, - grad_norm_b}; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("forward_fp32", - &ds_transformer_forward, - 
"DeepSpeed Transformer forward with fp32 (CUDA)"); - m.def("forward_fp16", - &ds_transformer_forward<__half>, - "DeepSpeed Transformer forward with fp16 (CUDA)"); - m.def("backward_fp32", - &ds_transformer_backward, - "DeepSpeed Transformer backward with fp32 (CUDA)"); - m.def("backward_fp16", - &ds_transformer_backward<__half>, - "DeepSpeed Transformer backward with fp16 (CUDA)"); - m.def("create_transformer_layer_fp32", - &create_transformer_layer, - "Create DeepSpeed Transformer Transformer Layer with fp32 (CUDA)"); - m.def("create_transformer_layer_fp16", - &create_transformer_layer<__half>, - "Create DeepSpeed Transformer Transformer Layer with fp16 (CUDA)"); -} diff --git a/deepspeed/ops/csrc/transformer/ds_transformer_hip.cpp b/deepspeed/ops/csrc/transformer/ds_transformer_hip.cpp deleted file mode 100644 index f9e0a53a93c1bf9aa04c7a072af00f159e9156a3..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/ds_transformer_hip.cpp +++ /dev/null @@ -1,1052 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include - -#include -#include -#include -#include -#include -#include -#include "Timer_hip.h" -#include "context_hip.h" -#include "cublas_wrappers_hip.h" -#include "custom_hip_layers.h" -#include "ds_transformer_hip.h" - -static std::unordered_map> s_transformer_layers; - -const int init_seq_length = 128; - -// C++ interface - -template -unsigned get_workspace_size(unsigned maxBatchSize, - unsigned seq_len, - unsigned hidden_size, - unsigned intermediate_size, - unsigned heads, - bool training, - bool gelu_checkpoint) -{ - unsigned workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size); - if (training) { - workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size); - workSpacesize += ((std::max)((size_t(maxBatchSize) * seq_len * intermediate_size), - 2 * (size_t(maxBatchSize) * heads * seq_len * seq_len))); - if (gelu_checkpoint) - workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * intermediate_size); - } - return workSpacesize; // * sizeof(T); -} - -// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
-#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -template -BertTransformerLayer::BertTransformerLayer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_size, - unsigned num_heads, - unsigned intermediate_size, - unsigned seq_length, - float attn_prob_dropout_ratio, - float hidden_output_dropout_ratio, - float layer_norm_eps, - bool pre_or_postLayerNorm, - const std::vector>& gemm_algos, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode) - : _layer_id(layer_id), - _batch_size(batch_size), - _hidden_size(hidden_size), - _heads(num_heads), - _intermediate_size(intermediate_size), - _seq_length(seq_length), - _training(true), - _pre_or_postLayerNorm(pre_or_postLayerNorm), - _attn_dropout_checkpoint(attn_dropout_checkpoint), - _normalize_invertible(normalize_invertible), - _gelu_checkpoint(gelu_checkpoint), - _stochastic_mode(stochastic_mode), - _stream(Context::Instance().GetCurrentStream()), - _cublasHandle(Context::Instance().GetCublasHandle()), - _qkv_linear(typename FeedForward::Config(batch_size * seq_length, - 3 * hidden_size, - hidden_size, - gemm_algos[0])), - _attn_out_linear(typename FeedForward::Config(batch_size * seq_length, - hidden_size, - hidden_size, - gemm_algos[0])), - _attn_layer_norm(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - layer_norm_eps, - true, - !normalize_invertible)), - _layer_norm(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - layer_norm_eps, - true, - !normalize_invertible)), - _ff1(typename FeedForward::Config(batch_size * seq_length, - _intermediate_size, - hidden_size, - gemm_algos[1])), - _ff2(typename FeedForward::Config(batch_size * seq_length, - hidden_size, - _intermediate_size, - gemm_algos[2])), - _softmax(typename 
Softmax::Config(batch_size, num_heads, seq_length)), - _gelu(typename Gelu::Config(_intermediate_size)), - _attn_prob_dropout(typename Dropout::Config(attn_prob_dropout_ratio, _seq_length)), - _attn_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), - _layer_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), - _attn_scores(typename StridedBatchGemm::Config(_batch_size * _heads, - _seq_length, - _seq_length, - _hidden_size / _heads, - //(T(1.0) / T(sqrt(_hidden_size / _heads))), - //aiss debug 0506 - (T(1.0 / (sqrt(_hidden_size / _heads)))), - T(0.0), - rocblas_operation_transpose, - rocblas_operation_none, - gemm_algos[3])), - _attn_context(typename StridedBatchGemm::Config(_batch_size * _heads, - _hidden_size / _heads, - _seq_length, - _seq_length, - T(1.0), - T(0.0), - rocblas_operation_none, - rocblas_operation_none, - gemm_algos[4])) -{ - assert(_hidden_size % _heads == 0); - - Initialize(); -} - -template -BertTransformerLayer::~BertTransformerLayer() -{ -} - -template -void BertTransformerLayer::Initialize() -{ -#ifndef __HIP_PLATFORM_HCC__ - if (std::is_same::value) rocblas_set_math_mode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); -#endif -} - -template -void BertTransformerLayer::Forward(unsigned bsz, - const T* input_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_qkvb_ptr, - const T* attn_ow_ptr, - const T* attn_ob_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* output_b_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - T* out_ptr, - T* inp_norm_ptr, - T* q_tf_ptr, - T* k_tf_ptr, - T* v_tf_ptr, - T* soft_out_ptr, - T* ctx_bufB_ptr, - T* attn_o_inp_ptr, - T* add_res_ptr, - T* ff1_inp_ptr, - T* gelu_inp_ptr, - T* ff2_inp_ptr) -{ - rocblas_set_stream(_cublasHandle, _stream); - - if (!_stochastic_mode) hipStreamSynchronize(_stream); - - T* workspace = 
static_cast(Context::Instance().GetWorkSpace()); - size_t small_buf_size = bsz * _seq_length * _hidden_size; - T* buf_0 = workspace; - T* buf_1 = buf_0 + small_buf_size; - T* buf_2 = buf_1; - - if (_normalize_invertible) { - add_res_ptr = buf_1 + 3 * small_buf_size; - buf_2 = add_res_ptr; - } - if (_gelu_checkpoint) buf_2 += small_buf_size; - if (_attn_dropout_checkpoint) - ctx_bufB_ptr = - (_gelu_checkpoint ? (buf_2 + (_intermediate_size / _hidden_size) * small_buf_size) - : (buf_1 + 4 * small_buf_size)); - - int bsz_seq = bsz * _seq_length; - - if (_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.ForwardCheckpoint( - bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - - else - _layer_norm.Forward( - bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - } - - if (_pre_or_postLayerNorm) - _qkv_linear.Forward(bsz_seq, inp_norm_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); - else - _qkv_linear.Forward(bsz_seq, input_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); - - launch_bias_add_transform_0213( - q_tf_ptr, buf_0, attn_qkvb_ptr, bsz, _seq_length, _hidden_size, _heads, _stream, 3); - - int bsz_heads = bsz * _heads; - - // attention scores - _attn_scores.Forward(bsz_heads, soft_out_ptr, k_tf_ptr, q_tf_ptr, _cublasHandle); - - // Softmax + Mask - _softmax.Forward(bsz, soft_out_ptr, input_mask_ptr, _stream); - - // attn prob dropout. - _attn_prob_dropout.Forward(bsz_heads * _seq_length, ctx_bufB_ptr, soft_out_ptr, _stream); - - // attention context - _attn_context.Forward(bsz_heads, buf_1, v_tf_ptr, ctx_bufB_ptr, _cublasHandle); - - launch_transform4d_0213( - attn_o_inp_ptr, buf_1, bsz, _heads, _seq_length, _hidden_size, _stream, 1); - - if (_pre_or_postLayerNorm) - _attn_out_linear.Forward(bsz_seq, attn_o_inp_ptr, attn_ow_ptr, buf_1, _cublasHandle); - else - _attn_out_linear.Forward(bsz_seq, attn_o_inp_ptr, attn_ow_ptr, ff1_inp_ptr, _cublasHandle); - - // attn output dropout. 
- if (_pre_or_postLayerNorm) - _attn_output_dropout.ForwardWithBias( - bsz_seq, add_res_ptr, buf_1, input_ptr, attn_ob_ptr, _stream); - else - _attn_output_dropout.ForwardWithBias( - bsz_seq, add_res_ptr, ff1_inp_ptr, input_ptr, attn_ob_ptr, _stream); - - if (_pre_or_postLayerNorm) { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.ForwardCheckpoint( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - else - _attn_layer_norm.Forward( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - } else { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.ForwardCheckpoint( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - else - _attn_layer_norm.Forward( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - } - - _ff1.Forward(bsz_seq, - ff1_inp_ptr, - inter_w_ptr, - (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), - _cublasHandle); - - _gelu.ForwardWithBiasAdd(bsz_seq, - (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), - inter_b_ptr, - (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), - _stream); - - _ff2.Forward( - bsz_seq, (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), output_w_ptr, out_ptr, _cublasHandle); - - // layer output dropout. 
- if (_pre_or_postLayerNorm) - _layer_output_dropout.ForwardWithBias( - bsz_seq, out_ptr, out_ptr, add_res_ptr, output_b_ptr, _stream); - else - _layer_output_dropout.ForwardWithBias( - bsz_seq, inp_norm_ptr, out_ptr, ff1_inp_ptr, output_b_ptr, _stream); - - if (!_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.ForwardCheckpoint( - bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - else - _layer_norm.Forward( - bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - } -} - -template -void BertTransformerLayer::Backward(unsigned bsz, - const T* grad_output_ptr, - const T* input_ptr, - const T* output_ptr, - const T* inp_norm_ptr, - const T* q_tf_ptr, - const T* k_tf_ptr, - const T* v_tf_ptr, - const T* soft_out_ptr, - const T* ctx_bufB_ptr, - const T* attn_o_inp_ptr, - const T* add_res_ptr, - const T* ff1_inp_ptr, - const T* gelu_inp_ptr, - const T* ff2_inp_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_ow_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - - T* grad_input_ptr, - T* grad_attn_qkvw_ptr, - T* grad_attn_qkvb_ptr, - T* grad_attn_ow_ptr, - T* grad_attn_ob_ptr, - T* grad_attn_nw_ptr, - T* grad_attn_nb_ptr, - T* grad_inter_w_ptr, - T* grad_inter_b_ptr, - T* grad_output_w_ptr, - T* grad_output_b_ptr, - T* grad_norm_w_ptr, - T* grad_norm_b_ptr) -{ - rocblas_set_stream(_cublasHandle, _stream); - - if (!_stochastic_mode) hipStreamSynchronize(_stream); - - T* workspace = static_cast(Context::Instance().GetWorkSpace()); - size_t small_buf_size = bsz * _seq_length * _hidden_size; - T* buf_0 = workspace; - T* buf_1 = buf_0 + small_buf_size; - T* buf_2 = buf_1 + small_buf_size; - T* buf_3 = buf_2 + small_buf_size; - - T* ff2_buf = (_gelu_checkpoint ? 
buf_3 + (bsz * _seq_length * _intermediate_size) - : buf_3 + small_buf_size); - T* ctx_bufB_ptr_recomp = ff2_buf + (_seq_length * _seq_length * bsz * _heads); - - hipStream_t streams[2] = {_stream, _stream}; - - int bsz_seq = bsz * _seq_length; - int bsz_heads = bsz * _heads; - - if (!_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - inp_norm_ptr); - - else - _layer_norm.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - output_ptr); - } - - if (_pre_or_postLayerNorm) - _layer_output_dropout.Backward(bsz_seq, buf_0, grad_output_ptr, _stream); - else - _layer_output_dropout.Backward(bsz_seq, buf_0, buf_1, _stream); - - const T* layer_dropout_buf = _layer_output_dropout.HasDropout() - ? buf_0 - : (_pre_or_postLayerNorm ? grad_output_ptr : buf_1); - - if (_gelu_checkpoint) - _gelu.ForwardWithBiasAdd(bsz_seq, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); - _ff2.Backward(bsz_seq, - layer_dropout_buf, - (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), - output_w_ptr, - grad_output_w_ptr, - grad_output_b_ptr, - _cublasHandle, - _stream, - ff2_buf); - - _gelu.Backward( - bsz_seq, ff2_buf, (_gelu_checkpoint ? 
ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, _stream); - - _ff1.Backward(bsz_seq, - ff2_buf, - ff1_inp_ptr, - inter_w_ptr, - grad_inter_w_ptr, - grad_inter_b_ptr, - _cublasHandle, - _stream, - buf_3); - - if (!_pre_or_postLayerNorm) - launch_fused_add2(buf_2, buf_3, buf_1, bsz, _seq_length, _hidden_size, _stream); - - if (_pre_or_postLayerNorm) { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); - - else - _attn_layer_norm.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); - } else { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); - - else - _attn_layer_norm.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); - } - - _attn_output_dropout.Backward(bsz_seq, buf_2, buf_0, _stream); - - T* attn_output_dropout_buf = _attn_output_dropout.HasDropout() ? buf_2 : buf_0; - - _attn_out_linear.Backward(bsz_seq, - attn_output_dropout_buf, - attn_o_inp_ptr, - attn_ow_ptr, - grad_attn_ow_ptr, - grad_attn_ob_ptr, - _cublasHandle, - _stream, - buf_1); - - launch_transform_0213(buf_2, buf_1, bsz, _seq_length, _hidden_size, _heads, _stream); - - if (_attn_prob_dropout.HasDropout()) { - if (_attn_dropout_checkpoint) - _attn_prob_dropout.Forward( - bsz_heads * _seq_length, ctx_bufB_ptr_recomp, soft_out_ptr, _stream, true); - - _attn_context.Backward(bsz_heads, - buf_2, - v_tf_ptr, - (_attn_dropout_checkpoint ? 
ctx_bufB_ptr_recomp : ctx_bufB_ptr), - _cublasHandle, - buf_3, - ff2_buf); - } else - _attn_context.Backward( - bsz_heads, buf_2, v_tf_ptr, soft_out_ptr, _cublasHandle, buf_3, ff2_buf); - - _attn_prob_dropout.Backward(bsz_heads * _seq_length, ff2_buf, _stream); - - _softmax.Backward(bsz, ff2_buf, soft_out_ptr, _stream); - - _attn_scores.Backward(bsz_heads, ff2_buf, k_tf_ptr, q_tf_ptr, _cublasHandle, buf_2, buf_1); - - launch_transform4d_0213(ff2_buf, buf_1, bsz, _heads, _seq_length, _hidden_size, _stream, 3); - - if (_pre_or_postLayerNorm) - _qkv_linear.Backward(bsz_seq, - ff2_buf, - inp_norm_ptr, - attn_qkvw_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - _cublasHandle, - _stream, - buf_2); - else - _qkv_linear.Backward(bsz_seq, - ff2_buf, - input_ptr, - attn_qkvw_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - _cublasHandle, - _stream, - buf_2); - - if (_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - input_ptr); - - else - _layer_norm.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - inp_norm_ptr); - } else - launch_fused_add2(grad_input_ptr, buf_2, buf_0, bsz, _seq_length, _hidden_size, _stream); -} - -template -void BertTransformerLayer::SetTrainingMode(bool training) -{ - // Dropout will be skipped when not in training model. 
- _attn_prob_dropout.SetTrainingMode(training); - _attn_output_dropout.SetTrainingMode(training); - _layer_output_dropout.SetTrainingMode(training); -} - -template -void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, - uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr, - T* attn_layer_norm_var, - T* attn_layer_norm_mean, - T* layer_norm_var, - T* layer_norm_mean) -{ - _attn_prob_dropout.SetMask(attn_prob_dropout_mask_ptr); - _attn_output_dropout.SetMask(attn_output_dropout_mask_ptr); - _layer_output_dropout.SetMask(layer_output_dropout_mask_ptr); - - _attn_layer_norm.SetVar(attn_layer_norm_var); - _attn_layer_norm.SetMean(attn_layer_norm_mean); - _layer_norm.SetVar(layer_norm_var); - _layer_norm.SetMean(layer_norm_mean); -} - -template -void BertTransformerLayer::SetSeqLength(unsigned seq_len) -{ - _seq_length = seq_len; - - _softmax.SetSeqLength(_seq_length); - _attn_prob_dropout.SetDimension(_seq_length); - _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); - _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); -} - -template -int create_transformer_layer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_dim, - unsigned num_heads, - unsigned intermediate_size, - float attn_dropout_ratio, - float hidden_dropout_ratio, - float layer_norm_eps, - int seed, - bool pre_or_postLayerNorm, - bool test_gemm, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode) -{ - Context::Instance().SetSeed(seed); - Context::Instance().TestGemmFP16( - test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads); - - auto layer = std::make_shared>(layer_id, - batch_size, - hidden_dim, - num_heads, - intermediate_size, - init_seq_length, - attn_dropout_ratio, - hidden_dropout_ratio, - layer_norm_eps, - pre_or_postLayerNorm, - Context::Instance().GetGemmAlgos(), - attn_dropout_checkpoint, - 
normalize_invertible, - gelu_checkpoint, - stochastic_mode); - - s_transformer_layers[layer_id] = layer; - - std::string dtype = (std::is_same::value) ? "half" : "float"; - - std::cout << "layer #" << layer_id << " is created with date type [" << dtype << "]." - << std::endl; - - return 0; -} - -template -std::vector ds_transformer_forward(unsigned layer_id, - const torch::Tensor& input, - const torch::Tensor& input_mask, - const torch::Tensor& attn_qkvw, - const torch::Tensor& attn_qkvb, - const torch::Tensor& attn_ow, - const torch::Tensor& attn_ob, - const torch::Tensor& attn_nw, - const torch::Tensor& attn_nb, - const torch::Tensor& inter_w, - const torch::Tensor& inter_b, - const torch::Tensor& output_w, - const torch::Tensor& output_b, - const torch::Tensor& norm_w, - const torch::Tensor& norm_b, - bool training_mode, - bool prelayernorm, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint) -{ - CHECK_INPUT(input); - CHECK_INPUT(input_mask); - CHECK_INPUT(attn_qkvw); - CHECK_INPUT(attn_qkvb); - CHECK_INPUT(attn_ow); - CHECK_INPUT(attn_ob); - CHECK_INPUT(attn_nw); - CHECK_INPUT(attn_nb); - CHECK_INPUT(inter_w); - CHECK_INPUT(inter_b); - CHECK_INPUT(output_w); - CHECK_INPUT(output_b); - CHECK_INPUT(norm_w); - CHECK_INPUT(norm_b); - - unsigned bsz = input.size(0); - - const T* input_ptr = (const T*)input.data_ptr(); - const T* input_mask_ptr = (const T*)input_mask.data_ptr(); - const T* attn_qkvw_ptr = (const T*)attn_qkvw.data_ptr(); - const T* attn_qkvb_ptr = (const T*)attn_qkvb.data_ptr(); - const T* attn_ow_ptr = (const T*)attn_ow.data_ptr(); - const T* attn_ob_ptr = (const T*)attn_ob.data_ptr(); - const T* attn_nw_ptr = (const T*)attn_nw.data_ptr(); - const T* attn_nb_ptr = (const T*)attn_nb.data_ptr(); - const T* inter_w_ptr = (const T*)inter_w.data_ptr(); - const T* inter_b_ptr = (const T*)inter_b.data_ptr(); - const T* output_w_ptr = (const T*)output_w.data_ptr(); - const T* output_b_ptr = (const T*)output_b.data_ptr(); - 
const T* norm_w_ptr = (const T*)norm_w.data_ptr(); - const T* norm_b_ptr = (const T*)norm_b.data_ptr(); - - auto output = torch::empty_like(input); - T* out_ptr = (T*)output.data_ptr(); - - auto options = torch::TensorOptions() - .dtype(input.options().dtype()) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(true); - - auto uint8_options = torch::TensorOptions() - .dtype(torch::kInt8) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(false); - - std::shared_ptr> layer = - std::static_pointer_cast>(s_transformer_layers[layer_id]); - - unsigned seq_len = layer->GetSeqLength(); - if (input.size(1) != seq_len) { - seq_len = input.size(1); - layer->SetSeqLength(seq_len); - } - - auto workspace = torch::empty({get_workspace_size(bsz, - seq_len, - layer->GetHiddenSize(), - layer->GetIntermediateSize(), - layer->GetNumHeads(), - layer->IsTrainingMode(), - layer->GeluCheckpoint())}, - options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); - - auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); - auto add_res = (normalize_invertible ? 
inp_norm : torch::empty_like(input)); - auto attn_o_inp = torch::empty_like(input); - auto qkv_tf = torch::empty({(bsz * seq_len), output_w.size(0) * 3}, options); - - auto attn_prob_dropout_mask = - torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, uint8_options); - auto attn_output_dropout_mask = - torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - auto layer_output_dropout_mask = - torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - - auto attn_layer_norm_var = torch::empty({(bsz * seq_len)}, options); - auto attn_layer_norm_mean = torch::empty({(bsz * seq_len)}, options); - auto layer_norm_var = torch::empty({(bsz * seq_len)}, options); - auto layer_norm_mean = torch::empty({(bsz * seq_len)}, options); - - T* inp_norm_ptr = (T*)inp_norm.data_ptr(); - T* add_res_ptr = (T*)add_res.data_ptr(); - T* q_tf_ptr = (T*)qkv_tf.data_ptr(); - T* k_tf_ptr = q_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)k_tf.data_ptr(); - T* v_tf_ptr = k_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)v_tf.data_ptr(); - T* attn_o_inp_ptr = (T*)attn_o_inp.data_ptr(); - - torch::Tensor ff2_inp = torch::empty({(bsz * seq_len), output_w.size(1)}, options); - torch::Tensor gelu_inp = - (gelu_checkpoint ? ff2_inp : torch::empty({(bsz * seq_len), output_w.size(1)}, options)); - auto ff1_inp = torch::empty_like(input); - T* ff2_inp_ptr = (T*)ff2_inp.data_ptr(); - T* gelu_inp_ptr = (T*)gelu_inp.data_ptr(); - T* ff1_inp_ptr = (T*)ff1_inp.data_ptr(); - - torch::Tensor soft_out = - torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options); - torch::Tensor ctx_bufB = - (attn_dropout_checkpoint - ? 
soft_out - : torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options)); - T* soft_out_ptr = (T*)soft_out.data_ptr(); - T* ctx_bufB_ptr = (T*)ctx_bufB.data_ptr(); - - layer->SetTrainingMode(training_mode); - layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), - (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)attn_layer_norm_var.data_ptr(), - (T*)attn_layer_norm_mean.data_ptr(), - (T*)layer_norm_var.data_ptr(), - (T*)layer_norm_mean.data_ptr()); - - layer->Forward(bsz, - input_ptr, - input_mask_ptr, - attn_qkvw_ptr, - attn_qkvb_ptr, - attn_ow_ptr, - attn_ob_ptr, - attn_nw_ptr, - attn_nb_ptr, - inter_w_ptr, - inter_b_ptr, - output_w_ptr, - output_b_ptr, - norm_w_ptr, - norm_b_ptr, - out_ptr, - inp_norm_ptr, - q_tf_ptr, - k_tf_ptr, - v_tf_ptr, - soft_out_ptr, - ctx_bufB_ptr, - attn_o_inp_ptr, - add_res_ptr, - ff1_inp_ptr, - gelu_inp_ptr, - ff2_inp_ptr); - - return {output, - inp_norm, - qkv_tf, - soft_out, - ctx_bufB, - attn_o_inp, - add_res, - ff1_inp, - gelu_inp, - ff2_inp, - attn_prob_dropout_mask, - attn_output_dropout_mask, - layer_output_dropout_mask, - attn_layer_norm_var, - attn_layer_norm_mean, - layer_norm_var, - layer_norm_mean}; -} - -template -std::vector ds_transformer_backward(unsigned layer_id, - const torch::Tensor& grad_output, - const torch::Tensor& output, - const torch::Tensor& inp_norm, - const torch::Tensor& qkv_tf, - const torch::Tensor& soft_out, - const torch::Tensor& ctx_bufB, - const torch::Tensor& attn_o_inp, - const torch::Tensor& add_res, - const torch::Tensor& ff1_inp, - const torch::Tensor& gelu_inp, - const torch::Tensor& ff2_inp, - const torch::Tensor& attn_prob_dropout_mask, - const torch::Tensor& attn_output_dropout_mask, - const torch::Tensor& layer_output_dropout_mask, - const torch::Tensor& attn_layer_norm_var, - const torch::Tensor& attn_layer_norm_mean, - const torch::Tensor& layer_norm_var, - const torch::Tensor& layer_norm_mean, - 
const torch::Tensor& input, - const torch::Tensor& input_mask, - const torch::Tensor& attn_qkvw, - const torch::Tensor& attn_qkvb, - const torch::Tensor& attn_ow, - const torch::Tensor& attn_ob, - const torch::Tensor& attn_nw, - const torch::Tensor& attn_nb, - const torch::Tensor& inter_w, - const torch::Tensor& inter_b, - const torch::Tensor& output_w, - const torch::Tensor& output_b, - const torch::Tensor& norm_w, - const torch::Tensor& norm_b) -{ - auto g_output = grad_output.contiguous(); - CHECK_INPUT(g_output); - CHECK_INPUT(output); - CHECK_INPUT(inp_norm); - CHECK_INPUT(qkv_tf); - CHECK_INPUT(add_res); - CHECK_INPUT(soft_out); - CHECK_INPUT(ctx_bufB); - CHECK_INPUT(attn_o_inp); - CHECK_INPUT(ff1_inp); - CHECK_INPUT(gelu_inp); - CHECK_INPUT(ff2_inp); - CHECK_INPUT(input); - CHECK_INPUT(input_mask); - CHECK_INPUT(attn_qkvw); - CHECK_INPUT(attn_qkvb); - CHECK_INPUT(attn_ow); - CHECK_INPUT(attn_ob); - CHECK_INPUT(attn_nw); - CHECK_INPUT(attn_nb); - CHECK_INPUT(inter_w); - CHECK_INPUT(inter_b); - CHECK_INPUT(output_w); - CHECK_INPUT(output_b); - CHECK_INPUT(norm_w); - CHECK_INPUT(norm_b); - - unsigned bsz = g_output.size(0); - - std::shared_ptr> layer = - std::static_pointer_cast>(s_transformer_layers[layer_id]); - - unsigned seq_len = layer->GetSeqLength(); - if (g_output.size(1) != seq_len) { - seq_len = g_output.size(1); - layer->SetSeqLength(seq_len); - } - auto options = torch::TensorOptions() - .dtype(g_output.options().dtype()) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(true); - auto workspace = torch::empty({get_workspace_size(bsz, - seq_len, - layer->GetHiddenSize(), - layer->GetIntermediateSize(), - layer->GetNumHeads(), - layer->IsTrainingMode(), - layer->GeluCheckpoint())}, - options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); - - auto grad_input = torch::empty_like(input); - auto grad_attn_qkvw = torch::empty_like(attn_qkvw); - auto grad_attn_qkvb = torch::empty_like(attn_qkvb); - auto grad_attn_ow = 
torch::empty_like(attn_ow); - auto grad_attn_ob = torch::empty_like(attn_ob); - auto grad_attn_nw = torch::empty_like(attn_nw); - auto grad_attn_nb = torch::empty_like(attn_nb); - auto grad_inter_w = torch::empty_like(inter_w); - auto grad_inter_b = torch::empty_like(inter_b); - auto grad_output_w = torch::empty_like(output_w); - auto grad_output_b = torch::empty_like(output_b); - auto grad_norm_w = torch::empty_like(norm_w); - auto grad_norm_b = torch::empty_like(norm_b); - - // inputs. - const T* grad_output_ptr = (const T*)g_output.data_ptr(); - const T* input_ptr = (const T*)input.data_ptr(); - const T* output_ptr = (const T*)output.data_ptr(); - const T* inp_norm_ptr = (const T*)inp_norm.data_ptr(); - const T* q_tf_ptr = (const T*)qkv_tf.data_ptr(); - const T* add_res_ptr = (const T*)add_res.data_ptr(); - const T* k_tf_ptr = - q_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(const T*)k_tf.data_ptr(); - const T* v_tf_ptr = - k_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(const T*)v_tf.data_ptr(); - const T* ff1_inp_ptr = (const T*)ff1_inp.data_ptr(); - const T* gelu_inp_ptr = (const T*)gelu_inp.data_ptr(); - const T* ff2_inp_ptr = (const T*)ff2_inp.data_ptr(); - const T* ctx_bufB_ptr = (const T*)ctx_bufB.data_ptr(); - const T* soft_out_ptr = (const T*)soft_out.data_ptr(); - const T* attn_o_inp_ptr = (const T*)attn_o_inp.data_ptr(); - const T* input_mask_ptr = (const T*)input_mask.data_ptr(); - const T* attn_qkvw_ptr = (const T*)attn_qkvw.data_ptr(); - const T* attn_ow_ptr = (const T*)attn_ow.data_ptr(); - const T* attn_nw_ptr = (const T*)attn_nw.data_ptr(); - const T* attn_nb_ptr = (const T*)attn_nb.data_ptr(); - const T* inter_w_ptr = (const T*)inter_w.data_ptr(); - const T* inter_b_ptr = (const T*)inter_b.data_ptr(); - const T* output_w_ptr = (const T*)output_w.data_ptr(); - const T* norm_w_ptr = (const T*)norm_w.data_ptr(); - const T* norm_b_ptr = (const T*)norm_b.data_ptr(); - - // outputs. 
- T* grad_input_ptr = (T*)grad_input.data_ptr(); - T* grad_attn_qkvw_ptr = (T*)grad_attn_qkvw.data_ptr(); - T* grad_attn_qkvb_ptr = (T*)grad_attn_qkvb.data_ptr(); - T* grad_attn_ow_ptr = (T*)grad_attn_ow.data_ptr(); - T* grad_attn_ob_ptr = (T*)grad_attn_ob.data_ptr(); - T* grad_attn_nw_ptr = (T*)grad_attn_nw.data_ptr(); - T* grad_attn_nb_ptr = (T*)grad_attn_nb.data_ptr(); - T* grad_inter_w_ptr = (T*)grad_inter_w.data_ptr(); - T* grad_inter_b_ptr = (T*)grad_inter_b.data_ptr(); - T* grad_output_w_ptr = (T*)grad_output_w.data_ptr(); - T* grad_output_b_ptr = (T*)grad_output_b.data_ptr(); - T* grad_norm_w_ptr = (T*)grad_norm_w.data_ptr(); - T* grad_norm_b_ptr = (T*)grad_norm_b.data_ptr(); - - layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), - (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)attn_layer_norm_var.data_ptr(), - (T*)attn_layer_norm_mean.data_ptr(), - (T*)layer_norm_var.data_ptr(), - (T*)layer_norm_mean.data_ptr()); - - layer->Backward(bsz, - grad_output_ptr, - input_ptr, - output_ptr, - inp_norm_ptr, - q_tf_ptr, - k_tf_ptr, - v_tf_ptr, - soft_out_ptr, - ctx_bufB_ptr, - attn_o_inp_ptr, - add_res_ptr, - ff1_inp_ptr, - gelu_inp_ptr, - ff2_inp_ptr, - input_mask_ptr, - attn_qkvw_ptr, - attn_ow_ptr, - attn_nw_ptr, - attn_nb_ptr, - inter_w_ptr, - inter_b_ptr, - output_w_ptr, - norm_w_ptr, - norm_b_ptr, - - grad_input_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - grad_attn_ow_ptr, - grad_attn_ob_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - grad_inter_w_ptr, - grad_inter_b_ptr, - grad_output_w_ptr, - grad_output_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr); - - return {grad_input, - grad_attn_qkvw, - grad_attn_qkvb, - grad_attn_ow, - grad_attn_ob, - grad_attn_nw, - grad_attn_nb, - grad_inter_w, - grad_inter_b, - grad_output_w, - grad_output_b, - grad_norm_w, - grad_norm_b}; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("forward_fp32", - &ds_transformer_forward, - 
"DeepSpeed Transformer forward with fp32 (CUDA)"); - m.def("forward_fp16", - &ds_transformer_forward<__half>, - "DeepSpeed Transformer forward with fp16 (CUDA)"); - m.def("backward_fp32", - &ds_transformer_backward, - "DeepSpeed Transformer backward with fp32 (CUDA)"); - m.def("backward_fp16", - &ds_transformer_backward<__half>, - "DeepSpeed Transformer backward with fp16 (CUDA)"); - m.def("create_transformer_layer_fp32", - &create_transformer_layer, - "Create DeepSpeed Transformer Transformer Layer with fp32 (CUDA)"); - m.def("create_transformer_layer_fp16", - &create_transformer_layer<__half>, - "Create DeepSpeed Transformer Transformer Layer with fp16 (CUDA)"); -} diff --git a/deepspeed/ops/csrc/transformer/gelu_kernels.cu b/deepspeed/ops/csrc/transformer/gelu_kernels.cu deleted file mode 100644 index d683cf0af83daf829e390d83182eb6f0ffd040a6..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/gelu_kernels.cu +++ /dev/null @@ -1,330 +0,0 @@ -#include "custom_cuda_layers.h" - -inline __device__ float gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); -} - -inline __device__ float d_gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - - float x2mul = x * x * mul_param; - float tan_h = tanhf(sqrt_param * (x + x * x2mul)); - float dg1 = 0.5f * (1.0f + tan_h); - float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h); - float dg3 = dg2 * 3 * x2mul; - return (dg1 + dg2 + dg3); -} - -/* -Fused bias add with GELU - -Loads a vector of 4 elements each iteration, for stride -iterations. It was written with the intention to launch 256 thread -threadblocks, so to launch for bert-large, we would set ITERATIONS -to 4. This is currently done automatically as a heuristic, setting -the number of iterations as blocks of 1024. 
- -For FP16, the values are loaded from memory as __half, but converted -to FP32 for the arithmetic itself, to prevent numerous overflow on -the intermediate hyperbolic tangent, since there's no intrinsic -that computes it directly. -*/ - -__global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float4* input_cast = reinterpret_cast(input); - float4* vals_cast = reinterpret_cast(vals); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 data = input_cast[row * row_stride + i * loop_stride + id]; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - vals_cast[row * row_stride + i * loop_stride + id] = data; - } - } -} - -__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float2* input_cast = reinterpret_cast(input); - float2* vals_cast = reinterpret_cast(vals); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - vals_cast[row * row_stride + i * loop_stride + id] = vals_vec; - } - } -#endif -} - -__global__ void fused_bias_gelu(const float* input, - const float* bias, - float* vals, - int row_stride, - int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int 
loop_stride = blockDim.x; - - const float4* input_cast = reinterpret_cast(input); - float4* vals_cast = reinterpret_cast(vals); - const float4* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 data = input_cast[row * row_stride + i * loop_stride + id]; - float4 bias_data = bias_cast[i * loop_stride + id]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - vals_cast[row * row_stride + i * loop_stride + id] = data; - } - } -} - -__global__ void fused_bias_gelu(const __half* input, - const __half* bias, - __half* vals, - int row_stride, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float2* input_cast = reinterpret_cast(input); - float2* vals_cast = reinterpret_cast(vals); - const float2* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id]; - float2 bias_vec = bias_cast[i * loop_stride + id]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - vals_cast[row * row_stride + i * 
loop_stride + id] = vals_vec; - } - } -#endif -} - -__global__ void d_gelu_func(float* d_output, - const float* gelu_input, - const float* bias, - int row_stride, - int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - float4* d_output_cast = reinterpret_cast(d_output); - const float4* gelu_input_cast = reinterpret_cast(gelu_input); - const float4* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 output_data = d_output_cast[row * row_stride + i * loop_stride + id]; - float4 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id]; - float4 bias_data = bias_cast[i * loop_stride + id]; - - gelu_input_data.x += bias_data.x; - gelu_input_data.y += bias_data.y; - gelu_input_data.z += bias_data.z; - gelu_input_data.w += bias_data.w; - - output_data.x *= d_gelu(gelu_input_data.x); - output_data.y *= d_gelu(gelu_input_data.y); - output_data.z *= d_gelu(gelu_input_data.z); - output_data.w *= d_gelu(gelu_input_data.w); - - d_output_cast[row * row_stride + i * loop_stride + id] = output_data; - } - } -} - -__global__ void d_gelu_func(__half* d_output, - const __half* gelu_input, - const __half* bias, - int row_stride, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - float2* d_output_cast = reinterpret_cast(d_output); - const float2* gelu_input_cast = reinterpret_cast(gelu_input); - const float2* bias_cast = reinterpret_cast(bias); - -#pragma unroll - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 output_data = d_output_cast[row * row_stride + i * loop_stride + id]; - float2 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id]; - float2 bias_vec = bias_cast[i * loop_stride + id]; - - __half2* output_data_half = reinterpret_cast<__half2*>(&output_data); - __half2* 
gelu_input_data_half = reinterpret_cast<__half2*>(&gelu_input_data); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 output_half_0 = __half22float2(output_data_half[0]); - float2 output_half_1 = __half22float2(output_data_half[1]); - - float2 gelu_input_half_0 = __half22float2(gelu_input_data_half[0]); - float2 gelu_input_half_1 = __half22float2(gelu_input_data_half[1]); - - float2 bias_half_0 = __half22float2(bias_half[0]); - float2 bias_half_1 = __half22float2(bias_half[1]); - - gelu_input_half_0.x += bias_half_0.x; - gelu_input_half_0.y += bias_half_0.y; - gelu_input_half_1.x += bias_half_1.x; - gelu_input_half_1.y += bias_half_1.y; - - output_half_0.x *= d_gelu(gelu_input_half_0.x); - output_half_0.y *= d_gelu(gelu_input_half_0.y); - output_half_1.x *= d_gelu(gelu_input_half_1.x); - output_half_1.y *= d_gelu(gelu_input_half_1.y); - - float2 result; - __half2* result_half2 = reinterpret_cast<__half2*>(&result); - - result_half2[0] = __float22half2_rn(output_half_0); - result_half2[1] = __float22half2_rn(output_half_1); - - d_output_cast[row * row_stride + i * loop_stride + id] = result; - } - } -#endif -} - -template -void launch_bias_gelu(const T* input, - const T* bias, - T* output, - int intermediate_size, - int batch_size, - cudaStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - fused_bias_gelu<<>>( - input, bias, output, intermediate_size / 4, iterations); -} - -template -void launch_gelu(const T* input, - T* output, - int intermediate_size, - int batch_size, - cudaStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - gelu_kernel<<>>( - input, output, intermediate_size / 4, iterations); -} - -template void launch_bias_gelu(const float*, const 
float*, float*, int, int, cudaStream_t); -template void launch_bias_gelu<__half>(const __half*, - const __half*, - __half*, - int, - int, - cudaStream_t); - -template void launch_gelu(const float*, float*, int, int, cudaStream_t); -template void launch_gelu<__half>(const __half*, __half*, int, int, cudaStream_t); - -template -void launch_d_gelu(T* d_output, - const T* input, - const T* bias, - int intermediate_size, - int batch_size, - cudaStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - d_gelu_func<<>>( - d_output, input, bias, intermediate_size / 4, iterations); -} - -template void launch_d_gelu(float*, const float*, const float*, int, int, cudaStream_t); -template void launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, cudaStream_t); diff --git a/deepspeed/ops/csrc/transformer/gelu_kernels.hip b/deepspeed/ops/csrc/transformer/gelu_kernels.hip deleted file mode 100644 index f7e7a7fa7e0f22a6d20de4a1fbb5c2071afb9c77..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/gelu_kernels.hip +++ /dev/null @@ -1,332 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -inline __device__ float gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); -} - -inline __device__ float d_gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - - float x2mul = x * x * mul_param; - float tan_h = tanhf(sqrt_param * (x + x * x2mul)); - float dg1 = 0.5f * (1.0f + tan_h); - float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h); - float dg3 = dg2 * 3 * x2mul; - return (dg1 + dg2 + dg3); -} - -/* -Fused bias add with GELU - -Loads a vector of 4 elements each iteration, for stride -iterations. It was written with the intention to launch 256 thread -threadblocks, so to launch for bert-large, we would set ITERATIONS -to 4. This is currently done automatically as a heuristic, setting -the number of iterations as blocks of 1024. - -For FP16, the values are loaded from memory as __half, but converted -to FP32 for the arithmetic itself, to prevent numerous overflow on -the intermediate hyperbolic tangent, since there's no intrinsic -that computes it directly. 
-*/ - -__global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float4* input_cast = reinterpret_cast(input); - float4* vals_cast = reinterpret_cast(vals); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 data = input_cast[row * row_stride + i * loop_stride + id]; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - vals_cast[row * row_stride + i * loop_stride + id] = data; - } - } -} - -__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float2* input_cast = reinterpret_cast(input); - float2* vals_cast = reinterpret_cast(vals); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - vals_cast[row * row_stride + i * loop_stride + id] = vals_vec; - } - } -#endif -} - -__global__ void fused_bias_gelu(const float* input, - const float* bias, - float* vals, - int row_stride, - int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float4* input_cast = reinterpret_cast(input); - float4* vals_cast = reinterpret_cast(vals); - const float4* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * 
loop_stride + id < row_stride) { - float4 data = input_cast[row * row_stride + i * loop_stride + id]; - float4 bias_data = bias_cast[i * loop_stride + id]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - vals_cast[row * row_stride + i * loop_stride + id] = data; - } - } -} - -__global__ void fused_bias_gelu(const __half* input, - const __half* bias, - __half* vals, - int row_stride, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float2* input_cast = reinterpret_cast(input); - float2* vals_cast = reinterpret_cast(vals); - const float2* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id]; - float2 bias_vec = bias_cast[i * loop_stride + id]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - vals_cast[row * row_stride + i * loop_stride + id] = vals_vec; - } - } -#endif -} - -__global__ void d_gelu_func(float* d_output, - const float* gelu_input, - const float* bias, - int row_stride, - int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - 
int loop_stride = blockDim.x; - - float4* d_output_cast = reinterpret_cast(d_output); - const float4* gelu_input_cast = reinterpret_cast(gelu_input); - const float4* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 output_data = d_output_cast[row * row_stride + i * loop_stride + id]; - float4 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id]; - float4 bias_data = bias_cast[i * loop_stride + id]; - - gelu_input_data.x += bias_data.x; - gelu_input_data.y += bias_data.y; - gelu_input_data.z += bias_data.z; - gelu_input_data.w += bias_data.w; - - output_data.x *= d_gelu(gelu_input_data.x); - output_data.y *= d_gelu(gelu_input_data.y); - output_data.z *= d_gelu(gelu_input_data.z); - output_data.w *= d_gelu(gelu_input_data.w); - - d_output_cast[row * row_stride + i * loop_stride + id] = output_data; - } - } -} - -__global__ void d_gelu_func(__half* d_output, - const __half* gelu_input, - const __half* bias, - int row_stride, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - float2* d_output_cast = reinterpret_cast(d_output); - const float2* gelu_input_cast = reinterpret_cast(gelu_input); - const float2* bias_cast = reinterpret_cast(bias); - -#pragma unroll - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 output_data = d_output_cast[row * row_stride + i * loop_stride + id]; - float2 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id]; - float2 bias_vec = bias_cast[i * loop_stride + id]; - - __half2* output_data_half = reinterpret_cast<__half2*>(&output_data); - __half2* gelu_input_data_half = reinterpret_cast<__half2*>(&gelu_input_data); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 output_half_0 = __half22float2(output_data_half[0]); - float2 output_half_1 = 
__half22float2(output_data_half[1]); - - float2 gelu_input_half_0 = __half22float2(gelu_input_data_half[0]); - float2 gelu_input_half_1 = __half22float2(gelu_input_data_half[1]); - - float2 bias_half_0 = __half22float2(bias_half[0]); - float2 bias_half_1 = __half22float2(bias_half[1]); - - gelu_input_half_0.x += bias_half_0.x; - gelu_input_half_0.y += bias_half_0.y; - gelu_input_half_1.x += bias_half_1.x; - gelu_input_half_1.y += bias_half_1.y; - - output_half_0.x *= d_gelu(gelu_input_half_0.x); - output_half_0.y *= d_gelu(gelu_input_half_0.y); - output_half_1.x *= d_gelu(gelu_input_half_1.x); - output_half_1.y *= d_gelu(gelu_input_half_1.y); - - float2 result; - __half2* result_half2 = reinterpret_cast<__half2*>(&result); - - result_half2[0] = __float22half2_rn(output_half_0); - result_half2[1] = __float22half2_rn(output_half_1); - - d_output_cast[row * row_stride + i * loop_stride + id] = result; - } - } -#endif -} - -template -void launch_bias_gelu(const T* input, - const T* bias, - T* output, - int intermediate_size, - int batch_size, - hipStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - hipLaunchKernelGGL(( fused_bias_gelu), dim3(grid_dims), dim3(block_dims), 0, stream, - input, bias, output, intermediate_size / 4, iterations); -} - -template -void launch_gelu(const T* input, - T* output, - int intermediate_size, - int batch_size, - hipStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - hipLaunchKernelGGL(( gelu_kernel), dim3(grid_dims), dim3(block_dims), 0, stream, - input, output, intermediate_size / 4, iterations); -} - -template void launch_bias_gelu(const float*, const float*, float*, int, int, hipStream_t); -template void launch_bias_gelu<__half>(const __half*, 
- const __half*, - __half*, - int, - int, - hipStream_t); - -template void launch_gelu(const float*, float*, int, int, hipStream_t); -template void launch_gelu<__half>(const __half*, __half*, int, int, hipStream_t); - -template -void launch_d_gelu(T* d_output, - const T* input, - const T* bias, - int intermediate_size, - int batch_size, - hipStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - hipLaunchKernelGGL(( d_gelu_func), dim3(grid_dims), dim3(block_dims), 0, stream, - d_output, input, bias, intermediate_size / 4, iterations); -} - -template void launch_d_gelu(float*, const float*, const float*, int, int, hipStream_t); -template void launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, hipStream_t); diff --git a/deepspeed/ops/csrc/transformer/general_kernels.cu b/deepspeed/ops/csrc/transformer/general_kernels.cu deleted file mode 100644 index 1eaa94e1e71a40bf44b661656395cf1ed087f589..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/general_kernels.cu +++ /dev/null @@ -1,411 +0,0 @@ -#include "general_kernels.h" - -namespace cg = cooperative_groups; - -template -__global__ void column_sum_reduce(const T* __restrict__ inp, - T* __restrict__ out, - int rows, - int width) -{ - __shared__ float tile[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - - int y_stride = width * TILE_DIM; - - float localSum = 0; - - // Loop across matrix height - if (idx < width) { - int offset = threadIdx.y * width + idx; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - localSum += (float)inp[offset]; - offset += y_stride; - } - } - - tile[threadIdx.x][threadIdx.y] = localSum; - - __syncthreads(); - - // Sum the shared buffer. 
- float sum = tile[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) sum += g.shfl_down(sum, i); - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - if (pos < width) out[pos] = sum; - } -} - -template -void launch_fuse_transpose_bias_kernel(const T* inp, - T* out, - int rows, - int cols, - cudaStream_t stream); - -template <> -void launch_fuse_transpose_bias_kernel(const float* inp, - float* out, - int rows, - int cols, - cudaStream_t stream) -{ - // assert(rows % TILE_DIM == 0); - // assert(cols % TILE_DIM == 0); - - dim3 grid_dim((cols - 1) / TILE_DIM + 1); - dim3 block_dim(TILE_DIM, TILE_DIM); - - column_sum_reduce<<>>(inp, out, rows, cols); -} - -template <> -void launch_fuse_transpose_bias_kernel<__half>(const __half* inp, - __half* out, - int rows, - int cols, - cudaStream_t stream) -{ - // assert(rows % TILE_DIM == 0); - // assert(cols % TILE_DIM == 0); - - dim3 grid_dim((cols - 1) / TILE_DIM + 1); - dim3 block_dim(TILE_DIM, TILE_DIM); - - column_sum_reduce<__half><<>>(inp, out, rows, cols); -} - -__global__ void fused_add2_kernel(const int N, float* out, const float* inp1, const float* inp2) -{ - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - float4* out_4 = reinterpret_cast(out); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 val; - float4 inp1_reg = inp1_4[j]; - float4 inp2_reg = inp2_4[j]; - - val.x = inp1_reg.x + inp2_reg.x; - val.y = inp1_reg.y + inp2_reg.y; - val.z = inp1_reg.z + inp2_reg.z; - val.w = inp1_reg.w + inp2_reg.w; - - out_4[j] = val; - } -} - -__global__ void fused_add2_kernel(const int N, __half* out, const __half* inp1, const __half* inp2) -{ - float2 inp1_4; - float2 inp2_4; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = 
reinterpret_cast(inp2); - - CUDA_1D_KERNEL_LOOP(j, N) - { - inp1_4 = inp1_arr[j]; - inp2_4 = inp2_arr[j]; - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - inp1_h_f_0.x += inp2_h_f_0.x; - inp1_h_f_0.y += inp2_h_f_0.y; - inp1_h_f_1.x += inp2_h_f_1.x; - inp1_h_f_1.y += inp2_h_f_1.y; - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[j] = val_f; - } -} - -template <> -void launch_fused_add2(float* out, - const float* inp1, - const float* inp2, - int batch_size, - int seq_length, - int hidden_dim, - cudaStream_t& stream) -{ - int total_count = batch_size * seq_length * hidden_dim / 4; - dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - - dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - - fused_add2_kernel<<>>(total_count, out, inp1, inp2); -} - -template <> -void launch_fused_add2<__half>(__half* out, - const __half* inp1, - const __half* inp2, - int batch_size, - int seq_length, - int hidden_dim, - cudaStream_t& stream) -{ - int total_count = batch_size * seq_length * hidden_dim / 4; - dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - - dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - - fused_add2_kernel<<>>(total_count, out, inp1, inp2); -} - -__global__ void fused_add3_kernel(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - const float4* inp3_4 = reinterpret_cast(inp3); - - float4* out_4 = reinterpret_cast(out); - - float4 val; - float4 inp1_reg = inp1_4[row * row_stride + id]; - 
float4 inp2_reg = inp2_4[row * row_stride + id]; - float4 inp3_reg = inp3_4[row * row_stride + id]; - - val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x; - val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y; - val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z; - val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w; - - out_4[row * row_stride + id] = val; -} - -__global__ void fused_add3_kernel(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - const float2* inp3_arr = reinterpret_cast(inp3); - - float2 inp1_4 = inp1_arr[row * row_stride + id]; - float2 inp2_4 = inp2_arr[row * row_stride + id]; - float2 inp3_4 = inp3_arr[row * row_stride + id]; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - __half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4); - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - float2 inp3_h_f_0 = __half22float2(inp3_h[0]); - float2 inp3_h_f_1 = __half22float2(inp3_h[1]); - - inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x); - inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y); - inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x); - inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y); - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[row * row_stride + id] = val_f; -} - -template <> -void launch_fused_add3(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); 
- - dim3 block_dim(hidden_size / 4); - - fused_add3_kernel<<>>( - out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -template <> -void launch_fused_add3<__half>(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - fused_add3_kernel<<>>( - out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -__global__ void fused_add4_kernel(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - const float* inp4, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - const float4* inp3_4 = reinterpret_cast(inp3); - const float4* inp4_4 = reinterpret_cast(inp4); - float4* out_4 = reinterpret_cast(out); - - float4 val; - float4 inp1_reg = inp1_4[row * row_stride + id]; - float4 inp2_reg = inp2_4[row * row_stride + id]; - float4 inp3_reg = inp3_4[row * row_stride + id]; - float4 inp4_reg = inp4_4[row * row_stride + id]; - - val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x + inp4_reg.x; - val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y + inp4_reg.y; - val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z + inp4_reg.z; - val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w + inp4_reg.w; - - out_4[row * row_stride + id] = val; -} - -__global__ void fused_add4_kernel(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - const __half* inp4, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - const float2* inp3_arr = reinterpret_cast(inp3); - const float2* inp4_arr = reinterpret_cast(inp4); - - float2 inp1_4 = inp1_arr[row * row_stride + id]; - float2 
inp2_4 = inp2_arr[row * row_stride + id]; - float2 inp3_4 = inp3_arr[row * row_stride + id]; - float2 inp4_4 = inp4_arr[row * row_stride + id]; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - __half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4); - __half2* inp4_h = reinterpret_cast<__half2*>(&inp4_4); - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - float2 inp3_h_f_0 = __half22float2(inp3_h[0]); - float2 inp3_h_f_1 = __half22float2(inp3_h[1]); - - float2 inp4_h_f_0 = __half22float2(inp4_h[0]); - float2 inp4_h_f_1 = __half22float2(inp4_h[1]); - - inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x + inp4_h_f_0.x); - inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y + inp4_h_f_0.y); - inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x + inp4_h_f_1.x); - inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y + inp4_h_f_1.y); - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[row * row_stride + id] = val_f; -} - -template <> -void launch_fused_add4(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - const float* inp4, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - fused_add4_kernel<<>>( - out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -template <> -void launch_fused_add4<__half>(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - const __half* inp4, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size 
/ 4); - - fused_add4_kernel<<>>( - out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4); -} diff --git a/deepspeed/ops/csrc/transformer/general_kernels.hip b/deepspeed/ops/csrc/transformer/general_kernels.hip deleted file mode 100644 index 5be2fc240debf3dbddba72f0f9587331222910df..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/general_kernels.hip +++ /dev/null @@ -1,413 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "general_kernels_hip.h" - -namespace cg = cooperative_groups; - -template -__global__ void column_sum_reduce(const T* __restrict__ inp, - T* __restrict__ out, - int rows, - int width) -{ - __shared__ float tile[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - - int y_stride = width * TILE_DIM; - - float localSum = 0; - - // Loop across matrix height - if (idx < width) { - int offset = threadIdx.y * width + idx; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - localSum += (float)inp[offset]; - offset += y_stride; - } - } - - tile[threadIdx.x][threadIdx.y] = localSum; - - __syncthreads(); - - // Sum the shared buffer. 
- float sum = tile[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) sum += g.shfl_down(sum, i); - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - if (pos < width) out[pos] = sum; - } -} - -template -void launch_fuse_transpose_bias_kernel(const T* inp, - T* out, - int rows, - int cols, - hipStream_t stream); - -template <> -void launch_fuse_transpose_bias_kernel(const float* inp, - float* out, - int rows, - int cols, - hipStream_t stream) -{ - // assert(rows % TILE_DIM == 0); - // assert(cols % TILE_DIM == 0); - - dim3 grid_dim((cols - 1) / TILE_DIM + 1); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( column_sum_reduce), dim3(grid_dim), dim3(block_dim), 0, stream, inp, out, rows, cols); -} - -template <> -void launch_fuse_transpose_bias_kernel<__half>(const __half* inp, - __half* out, - int rows, - int cols, - hipStream_t stream) -{ - // assert(rows % TILE_DIM == 0); - // assert(cols % TILE_DIM == 0); - - dim3 grid_dim((cols - 1) / TILE_DIM + 1); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( column_sum_reduce<__half>), dim3(grid_dim), dim3(block_dim), 0, stream, inp, out, rows, cols); -} - -__global__ void fused_add2_kernel(const int N, float* out, const float* inp1, const float* inp2) -{ - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - float4* out_4 = reinterpret_cast(out); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 val; - float4 inp1_reg = inp1_4[j]; - float4 inp2_reg = inp2_4[j]; - - val.x = inp1_reg.x + inp2_reg.x; - val.y = inp1_reg.y + inp2_reg.y; - val.z = inp1_reg.z + inp2_reg.z; - val.w = inp1_reg.w + inp2_reg.w; - - out_4[j] = val; - } -} - -__global__ void fused_add2_kernel(const int N, __half* out, const __half* inp1, const __half* inp2) -{ - float2 inp1_4; - float2 inp2_4; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = 
reinterpret_cast<__half2*>(&inp2_4); - - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - - CUDA_1D_KERNEL_LOOP(j, N) - { - inp1_4 = inp1_arr[j]; - inp2_4 = inp2_arr[j]; - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - inp1_h_f_0.x += inp2_h_f_0.x; - inp1_h_f_0.y += inp2_h_f_0.y; - inp1_h_f_1.x += inp2_h_f_1.x; - inp1_h_f_1.y += inp2_h_f_1.y; - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[j] = val_f; - } -} - -template <> -void launch_fused_add2(float* out, - const float* inp1, - const float* inp2, - int batch_size, - int seq_length, - int hidden_dim, - hipStream_t& stream) -{ - int total_count = batch_size * seq_length * hidden_dim / 4; - dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - - dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - - hipLaunchKernelGGL(( fused_add2_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, total_count, out, inp1, inp2); -} - -template <> -void launch_fused_add2<__half>(__half* out, - const __half* inp1, - const __half* inp2, - int batch_size, - int seq_length, - int hidden_dim, - hipStream_t& stream) -{ - int total_count = batch_size * seq_length * hidden_dim / 4; - dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - - dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - - hipLaunchKernelGGL(( fused_add2_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, total_count, out, inp1, inp2); -} - -__global__ void fused_add3_kernel(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - - const float4* 
inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - const float4* inp3_4 = reinterpret_cast(inp3); - - float4* out_4 = reinterpret_cast(out); - - float4 val; - float4 inp1_reg = inp1_4[row * row_stride + id]; - float4 inp2_reg = inp2_4[row * row_stride + id]; - float4 inp3_reg = inp3_4[row * row_stride + id]; - - val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x; - val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y; - val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z; - val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w; - - out_4[row * row_stride + id] = val; -} - -__global__ void fused_add3_kernel(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - const float2* inp3_arr = reinterpret_cast(inp3); - - float2 inp1_4 = inp1_arr[row * row_stride + id]; - float2 inp2_4 = inp2_arr[row * row_stride + id]; - float2 inp3_4 = inp3_arr[row * row_stride + id]; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - __half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4); - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - float2 inp3_h_f_0 = __half22float2(inp3_h[0]); - float2 inp3_h_f_1 = __half22float2(inp3_h[1]); - - inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x); - inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y); - inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x); - inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y); - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[row * row_stride + id] = 
val_f; -} - -template <> -void launch_fused_add3(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - hipLaunchKernelGGL(( fused_add3_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -template <> -void launch_fused_add3<__half>(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - hipLaunchKernelGGL(( fused_add3_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -__global__ void fused_add4_kernel(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - const float* inp4, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - const float4* inp3_4 = reinterpret_cast(inp3); - const float4* inp4_4 = reinterpret_cast(inp4); - float4* out_4 = reinterpret_cast(out); - - float4 val; - float4 inp1_reg = inp1_4[row * row_stride + id]; - float4 inp2_reg = inp2_4[row * row_stride + id]; - float4 inp3_reg = inp3_4[row * row_stride + id]; - float4 inp4_reg = inp4_4[row * row_stride + id]; - - val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x + inp4_reg.x; - val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y + inp4_reg.y; - val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z + inp4_reg.z; - val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w + inp4_reg.w; - - out_4[row * row_stride + id] = val; -} - -__global__ void fused_add4_kernel(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, 
- const __half* inp4, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - const float2* inp3_arr = reinterpret_cast(inp3); - const float2* inp4_arr = reinterpret_cast(inp4); - - float2 inp1_4 = inp1_arr[row * row_stride + id]; - float2 inp2_4 = inp2_arr[row * row_stride + id]; - float2 inp3_4 = inp3_arr[row * row_stride + id]; - float2 inp4_4 = inp4_arr[row * row_stride + id]; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - __half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4); - __half2* inp4_h = reinterpret_cast<__half2*>(&inp4_4); - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - float2 inp3_h_f_0 = __half22float2(inp3_h[0]); - float2 inp3_h_f_1 = __half22float2(inp3_h[1]); - - float2 inp4_h_f_0 = __half22float2(inp4_h[0]); - float2 inp4_h_f_1 = __half22float2(inp4_h[1]); - - inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x + inp4_h_f_0.x); - inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y + inp4_h_f_0.y); - inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x + inp4_h_f_1.x); - inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y + inp4_h_f_1.y); - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[row * row_stride + id] = val_f; -} - -template <> -void launch_fused_add4(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - const float* inp4, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - hipLaunchKernelGGL(( fused_add4_kernel), 
dim3(grid_dim), dim3(block_dim), 0, stream, - out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -template <> -void launch_fused_add4<__half>(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - const __half* inp4, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - hipLaunchKernelGGL(( fused_add4_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4); -} diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu b/deepspeed/ops/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu deleted file mode 100644 index 175854b8860b92e68485342a21bc7d636d58065c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu +++ /dev/null @@ -1,374 +0,0 @@ -#include "custom_cuda_layers.h" - -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif - -namespace cg = cooperative_groups; - -__global__ void apply_rotary_pos_emb(float* mixed_query, - float* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; - - lane += WARP_SIZE; - } - } -} - -__global__ void apply_rotary_pos_emb(__half* mixed_query, - __half* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; - - lane += WARP_SIZE; - } - } -#endif -} -__global__ void apply_rotary_pos_emb1(float* mixed_query, - float* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; - - lane += WARP_SIZE; - } - } -} -__global__ void apply_rotary_pos_emb1(__half* mixed_query, - __half* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - constexpr unsigned mask[32] = { - 0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, 0x10 | 0x10000, - 0x20 | 0x20000, 0x40 | 0x40000, 0x80 | 0x80000, 0x100 | 0x100000, 0x200 | 0x200000, - 0x400 | 0x400000, 0x800 | 0x800000, 0x1000 | 0x1, 0x2000 | 0x2, 0x4000 | 0x4, - 0x8000 | 0x8, 0x10000 | 0x10, 0x20000 | 0x20, 0x40000 | 0x40, 0x80000 | 0x80, - 0x100000 | 0x100, 0x200000 | 0x200, 0x400000 | 0x400, 0x800000 | 0x800, 0x1000000, - 0x2000000, 0x4000000, 0x8000000, 0x10000000, 0x20000000, - 0x40000000, 0x80000000}; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - unsigned half_dim = rotary_dim >> 1; - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane % half_dim) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; - float rotary_sign = (lane > (half_dim - 1) ? -1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - auto q_rot_tmp = lane < half_dim ? 
__shfl_sync(mask[lane], q_rot, lane + half_dim) - : __shfl_sync(mask[lane], q_rot, lane - half_dim); - auto k_rot_tmp = lane < half_dim ? __shfl_sync(mask[lane], k_rot, lane + half_dim) - : __shfl_sync(mask[lane], k_rot, lane - half_dim); - q = q * cosf(inv_freq) + q_rot_tmp * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); - - mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; - - lane += WARP_SIZE; - } - } -#endif -} - -template -void launch_apply_rotary_pos_emb(T* mixed_query, - T* key_layer, - unsigned head_size, - unsigned seq_len, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - unsigned batch, - bool rotate_half, - bool rotate_every_two, - cudaStream_t stream) -{ - int total_count = batch * num_heads * seq_len; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); - if (rotate_every_two) - apply_rotary_pos_emb<<>>( - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); - else if (rotate_half) - apply_rotary_pos_emb1<<>>( - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); -} - -template void launch_apply_rotary_pos_emb(float*, - float*, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - bool, - bool, - cudaStream_t); -template void launch_apply_rotary_pos_emb<__half>(__half*, - __half*, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - bool, - bool, - cudaStream_t); -/* -__global__ void apply_rotary_pos_emb(float* mixed_query, -float* key_layer, -unsigned rotary_dim, -unsigned seq_len, -unsigned seq_offset, -unsigned num_heads, -unsigned head_size, -unsigned total_count) -{ -cg::thread_block b = cg::this_thread_block(); -cg::thread_block_tile g = cg::tiled_partition(b); - -int id = threadIdx.x; -int gid = id >> 5; -int lane = id & 0x1f; - -unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; -unsigned offset = head_id * 
head_size; - -unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - -if (head_id < total_count) { -while (lane < rotary_dim) { -float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; -inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; -float q = mixed_query[offset + lane]; -float k = key_layer[offset + lane]; -float rotary_sign = (lane % 2 == 1 ? -1.0 : 1.0); -float q_rot = (q * rotary_sign); -float k_rot = (k * rotary_sign); -q_rot = g.shfl_xor(q_rot, 1); -k_rot = g.shfl_xor(k_rot, 1); -q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); -k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - -mixed_query[offset + lane] = q; -key_layer[offset + lane] = k; - -lane += WARP_SIZE; -} -} -} - -__global__ void apply_rotary_pos_emb(__half* mixed_query, -__half* key_layer, -unsigned rotary_dim, -unsigned seq_len, -unsigned seq_offset, -unsigned num_heads, -unsigned head_size, -unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 -cg::thread_block b = cg::this_thread_block(); -cg::thread_block_tile g = cg::tiled_partition(b); - -int id = threadIdx.x; -int gid = id >> 5; -int lane = id & 0x1f; - -unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; -unsigned offset = head_id * head_size; -constexpr unsigned mask[32] = {0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, -0x10 | 0x10000, 0x20 | 0x20000, 0x40 | 0x40000, 0x80 | 0x80000, -0x100 | 0x100000, 0x200 | 0x200000, 0x400 | 0x400000, 0x800 | 0x800000, -0x1000 | 0x1, 0x2000 | 0x2, 0x4000 | 0x4, 0x8000 | 0x8, -0x10000 | 0x10, 0x20000 | 0x20, 0x40000 | 0x40, 0x80000 | 0x80, -0x100000 | 0x100, 0x200000 | 0x200, 0x400000 | 0x400, 0x800000 | 0x800, -0x1000000, 0x2000000, 0x4000000, 0x8000000, -0x10000000, 0x20000000, 0x40000000, 0x80000000}; -unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - -if (head_id < total_count) { -while (lane < rotary_dim) { -//float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; -float inv_freq = (float)((lane % (rotary_dim >> 1)) * 2) / (float)rotary_dim; 
-inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; -float q = (float)mixed_query[offset + lane]; -float k = (float)key_layer[offset + lane]; -float rotary_sign = (lane > 11 ? -1.0 : 1.0); -float q_rot = (q * rotary_sign); -float k_rot = (k * rotary_sign); -auto q_rot_tmp = lane < 12 ? __shfl_sync(mask[lane], q_rot, lane + 12) : __shfl_sync(mask[lane], -q_rot, lane - 12);//g.shfl_xor(q_rot, 12); auto k_rot_tmp = lane < 12 ? __shfl_sync(mask[lane], -k_rot, lane + 12) : __shfl_sync(mask[lane], k_rot, lane - 12);//g.shfl_xor(k_rot, 12); q = q * -cosf(inv_freq) + q_rot_tmp * sinf(inv_freq); k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); - -mixed_query[offset + lane] = (__half)q; -key_layer[offset + lane] = (__half)k; - -lane += WARP_SIZE; -} -} -#endif -} - -template -void launch_apply_rotary_pos_emb(T* mixed_query, -T* key_layer, -unsigned head_size, -unsigned seq_len, -unsigned rotary_dim, -unsigned offset, -unsigned num_heads, -unsigned batch, -cudaStream_t stream) -{ -int total_count = batch * num_heads * seq_len; -dim3 block_dims(1024); -dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); - -apply_rotary_pos_emb<<>>( -mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); -} - -template void launch_apply_rotary_pos_emb(float*, -float*, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -cudaStream_t); -template void launch_apply_rotary_pos_emb<__half>(__half*, -__half*, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -cudaStream_t); -*/ diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/apply_rotary_pos_emb.hip b/deepspeed/ops/csrc/transformer/inference/csrc/apply_rotary_pos_emb.hip deleted file mode 100644 index 4e04f7aeb4c80be79c2fe6d8b91a4cc2fecde823..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/apply_rotary_pos_emb.hip +++ /dev/null @@ -1,376 +0,0 @@ -// !!! 
This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif - -namespace cg = cooperative_groups; - -__global__ void apply_rotary_pos_emb(float* mixed_query, - float* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; - - lane += WARP_SIZE; - } - } -} - -__global__ void apply_rotary_pos_emb(__half* mixed_query, - __half* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; - - lane += WARP_SIZE; - } - } -#endif -} -__global__ void apply_rotary_pos_emb1(float* mixed_query, - float* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; - - lane += WARP_SIZE; - } - } -} -__global__ void apply_rotary_pos_emb1(__half* mixed_query, - __half* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - constexpr unsigned mask[32] = { - 0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, 0x10 | 0x10000, - 0x20 | 0x20000, 0x40 | 0x40000, 0x80 | 0x80000, 0x100 | 0x100000, 0x200 | 0x200000, - 0x400 | 0x400000, 0x800 | 0x800000, 0x1000 | 0x1, 0x2000 | 0x2, 0x4000 | 0x4, - 0x8000 | 0x8, 0x10000 | 0x10, 0x20000 | 0x20, 0x40000 | 0x40, 0x80000 | 0x80, - 0x100000 | 0x100, 0x200000 | 0x200, 0x400000 | 0x400, 0x800000 | 0x800, 0x1000000, - 0x2000000, 0x4000000, 0x8000000, 0x10000000, 0x20000000, - 0x40000000, 0x80000000}; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - unsigned half_dim = rotary_dim >> 1; - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane % half_dim) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; - float rotary_sign = (lane > (half_dim - 1) ? -1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - auto q_rot_tmp = lane < half_dim ? 
__shfl_sync(mask[lane], q_rot, lane + half_dim) - : __shfl_sync(mask[lane], q_rot, lane - half_dim); - auto k_rot_tmp = lane < half_dim ? __shfl_sync(mask[lane], k_rot, lane + half_dim) - : __shfl_sync(mask[lane], k_rot, lane - half_dim); - q = q * cosf(inv_freq) + q_rot_tmp * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); - - mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; - - lane += WARP_SIZE; - } - } -#endif -} - -template -void launch_apply_rotary_pos_emb(T* mixed_query, - T* key_layer, - unsigned head_size, - unsigned seq_len, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - unsigned batch, - bool rotate_half, - bool rotate_every_two, - hipStream_t stream) -{ - int total_count = batch * num_heads * seq_len; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); - if (rotate_every_two) - hipLaunchKernelGGL(( apply_rotary_pos_emb), dim3(grid_dims), dim3(block_dims), 0, stream, - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); - else if (rotate_half) - hipLaunchKernelGGL(( apply_rotary_pos_emb1), dim3(grid_dims), dim3(block_dims), 0, stream, - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); -} - -template void launch_apply_rotary_pos_emb(float*, - float*, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - bool, - bool, - hipStream_t); -template void launch_apply_rotary_pos_emb<__half>(__half*, - __half*, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - bool, - bool, - hipStream_t); -/* -__global__ void apply_rotary_pos_emb(float* mixed_query, -float* key_layer, -unsigned rotary_dim, -unsigned seq_len, -unsigned seq_offset, -unsigned num_heads, -unsigned head_size, -unsigned total_count) -{ -cg::thread_block b = cg::this_thread_block(); -cg::thread_block_tile g = cg::tiled_partition(b); - -int id = threadIdx.x; -int gid = 
id >> 5; -int lane = id & 0x1f; - -unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; -unsigned offset = head_id * head_size; - -unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - -if (head_id < total_count) { -while (lane < rotary_dim) { -float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; -inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; -float q = mixed_query[offset + lane]; -float k = key_layer[offset + lane]; -float rotary_sign = (lane % 2 == 1 ? -1.0 : 1.0); -float q_rot = (q * rotary_sign); -float k_rot = (k * rotary_sign); -q_rot = g.shfl_xor(q_rot, 1); -k_rot = g.shfl_xor(k_rot, 1); -q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); -k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - -mixed_query[offset + lane] = q; -key_layer[offset + lane] = k; - -lane += WARP_SIZE; -} -} -} - -__global__ void apply_rotary_pos_emb(__half* mixed_query, -__half* key_layer, -unsigned rotary_dim, -unsigned seq_len, -unsigned seq_offset, -unsigned num_heads, -unsigned head_size, -unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 -cg::thread_block b = cg::this_thread_block(); -cg::thread_block_tile g = cg::tiled_partition(b); - -int id = threadIdx.x; -int gid = id >> 5; -int lane = id & 0x1f; - -unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; -unsigned offset = head_id * head_size; -constexpr unsigned mask[32] = {0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, -0x10 | 0x10000, 0x20 | 0x20000, 0x40 | 0x40000, 0x80 | 0x80000, -0x100 | 0x100000, 0x200 | 0x200000, 0x400 | 0x400000, 0x800 | 0x800000, -0x1000 | 0x1, 0x2000 | 0x2, 0x4000 | 0x4, 0x8000 | 0x8, -0x10000 | 0x10, 0x20000 | 0x20, 0x40000 | 0x40, 0x80000 | 0x80, -0x100000 | 0x100, 0x200000 | 0x200, 0x400000 | 0x400, 0x800000 | 0x800, -0x1000000, 0x2000000, 0x4000000, 0x8000000, -0x10000000, 0x20000000, 0x40000000, 0x80000000}; -unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - -if (head_id < total_count) { -while (lane < rotary_dim) { -//float inv_freq = 
(float)((lane / 2) * 2) / (float)rotary_dim; -float inv_freq = (float)((lane % (rotary_dim >> 1)) * 2) / (float)rotary_dim; -inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; -float q = (float)mixed_query[offset + lane]; -float k = (float)key_layer[offset + lane]; -float rotary_sign = (lane > 11 ? -1.0 : 1.0); -float q_rot = (q * rotary_sign); -float k_rot = (k * rotary_sign); -auto q_rot_tmp = lane < 12 ? __shfl_sync(mask[lane], q_rot, lane + 12) : __shfl_sync(mask[lane], -q_rot, lane - 12);//g.shfl_xor(q_rot, 12); auto k_rot_tmp = lane < 12 ? __shfl_sync(mask[lane], -k_rot, lane + 12) : __shfl_sync(mask[lane], k_rot, lane - 12);//g.shfl_xor(k_rot, 12); q = q * -cosf(inv_freq) + q_rot_tmp * sinf(inv_freq); k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); - -mixed_query[offset + lane] = (__half)q; -key_layer[offset + lane] = (__half)k; - -lane += WARP_SIZE; -} -} -#endif -} - -template -void launch_apply_rotary_pos_emb(T* mixed_query, -T* key_layer, -unsigned head_size, -unsigned seq_len, -unsigned rotary_dim, -unsigned offset, -unsigned num_heads, -unsigned batch, -hipStream_t stream) -{ -int total_count = batch * num_heads * seq_len; -dim3 block_dims(1024); -dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); -hipLaunchKernelGGL(( -apply_rotary_pos_emb), dim3(grid_dims), dim3(block_dims), 0, stream, -mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); -} - -template void launch_apply_rotary_pos_emb(float*, -float*, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -hipStream_t); -template void launch_apply_rotary_pos_emb<__half>(__half*, -__half*, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -hipStream_t); -*/ diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/dequantize.cu b/deepspeed/ops/csrc/transformer/inference/csrc/dequantize.cu deleted file mode 100644 index 4ddaabda3eb70b1b958b1fc4c2f959867828d1a2..0000000000000000000000000000000000000000 --- 
a/deepspeed/ops/csrc/transformer/inference/csrc/dequantize.cu +++ /dev/null @@ -1,110 +0,0 @@ -#include "custom_cuda_layers.h" - -#define MAX_QUANTIZE_GROUPING 1024 - -#define loop_unroll 1 -#define loop_unroll_bits 1 - -__global__ void dequantize_kernel(float* output, - const int8_t* input, - const float* qscale, - int output_size, - int hidden_dim, - int groups, - int merge_count) -{ - unsigned merge_hidden = hidden_dim >> merge_count; - unsigned quantization_stride = (merge_hidden * output_size) / groups; - - unsigned bid = blockIdx.x; - unsigned tid = threadIdx.x; - - while (tid < output_size) { - unsigned w_index = bid / merge_hidden; - unsigned q_index = tid + bid * output_size; - - auto q = input[q_index]; - - unsigned merge_hidden_total = w_index * merge_hidden; - unsigned scale_index = - ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) - << merge_count) + - w_index; - - float scale_data = qscale[scale_index]; - - output[q_index] = (scale_data * (float)q); - tid += blockDim.x; - } -} - -__global__ void dequantize_kernel(__half* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count) -{ -#ifdef HALF_PRECISION_AVAILABLE - - unsigned merge_hidden = hidden_dim >> merge_count; - unsigned quantization_stride = (merge_hidden * output_size) / groups; - - unsigned bid = blockIdx.x; - unsigned tid = threadIdx.x; - - while (tid < output_size) { - unsigned w_index = bid / merge_hidden; - unsigned q_index = tid + bid * output_size; - - auto q = input[q_index]; - - unsigned merge_hidden_total = w_index * merge_hidden; - unsigned scale_index = - ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) - << merge_count) + - w_index; - - float scale_data = qscale[scale_index]; - - output[q_index] = __float2half(scale_data * (float)q); - tid += blockDim.x; - } -#endif -} - -template -void launch_dequantize(T* output, - const int8_t* input, - 
const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count, - cudaStream_t stream) -{ - unsigned threads = 1024; - dim3 block_dims(threads); - dim3 grid_dims(hidden_dim); - - dequantize_kernel<<>>( - output, input, qscale, output_size, hidden_dim, groups, merge_count); -} - -template void launch_dequantize(float*, - const int8_t*, - const float*, - unsigned, - unsigned, - unsigned, - unsigned, - cudaStream_t); -template void launch_dequantize<__half>(__half*, - const int8_t*, - const float*, - unsigned, - unsigned, - unsigned, - unsigned, - cudaStream_t); diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/dequantize.hip b/deepspeed/ops/csrc/transformer/inference/csrc/dequantize.hip deleted file mode 100644 index 7c22e306aace1058947ed47e58c0427a4f066ecb..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/dequantize.hip +++ /dev/null @@ -1,112 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -#define MAX_QUANTIZE_GROUPING 1024 - -#define loop_unroll 1 -#define loop_unroll_bits 1 - -__global__ void dequantize_kernel(float* output, - const int8_t* input, - const float* qscale, - int output_size, - int hidden_dim, - int groups, - int merge_count) -{ - unsigned merge_hidden = hidden_dim >> merge_count; - unsigned quantization_stride = (merge_hidden * output_size) / groups; - - unsigned bid = blockIdx.x; - unsigned tid = threadIdx.x; - - while (tid < output_size) { - unsigned w_index = bid / merge_hidden; - unsigned q_index = tid + bid * output_size; - - auto q = input[q_index]; - - unsigned merge_hidden_total = w_index * merge_hidden; - unsigned scale_index = - ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) - << merge_count) + - w_index; - - float scale_data = qscale[scale_index]; - - output[q_index] = (scale_data * (float)q); - tid += blockDim.x; - } -} - -__global__ void dequantize_kernel(__half* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count) -{ -#ifdef HALF_PRECISION_AVAILABLE - - unsigned merge_hidden = hidden_dim >> merge_count; - unsigned quantization_stride = (merge_hidden * output_size) / groups; - - unsigned bid = blockIdx.x; - unsigned tid = threadIdx.x; - - while (tid < output_size) { - unsigned w_index = bid / merge_hidden; - unsigned q_index = tid + bid * output_size; - - auto q = input[q_index]; - - unsigned merge_hidden_total = w_index * merge_hidden; - unsigned scale_index = - ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) - << merge_count) + - w_index; - - float scale_data = qscale[scale_index]; - - output[q_index] = __float2half(scale_data * (float)q); - tid += blockDim.x; - } -#endif -} - -template -void launch_dequantize(T* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, 
- unsigned groups, - unsigned merge_count, - hipStream_t stream) -{ - unsigned threads = 1024; - dim3 block_dims(threads); - dim3 grid_dims(hidden_dim); - - hipLaunchKernelGGL(( dequantize_kernel), dim3(grid_dims), dim3(block_dims), 0, stream, - output, input, qscale, output_size, hidden_dim, groups, merge_count); -} - -template void launch_dequantize(float*, - const int8_t*, - const float*, - unsigned, - unsigned, - unsigned, - unsigned, - hipStream_t); -template void launch_dequantize<__half>(__half*, - const int8_t*, - const float*, - unsigned, - unsigned, - unsigned, - unsigned, - hipStream_t); diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/gelu.cu b/deepspeed/ops/csrc/transformer/inference/csrc/gelu.cu deleted file mode 100644 index 70bbf42cf9ed74558ce1b789d939c17d38573a86..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/gelu.cu +++ /dev/null @@ -1,525 +0,0 @@ -#include "custom_cuda_layers.h" - -#define MAX_CAP 4 -#define MAX_SEQ 2048 - -inline __device__ float gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); -} - -__global__ void fused_bias_gelu(float* input, - const float* bias, - int total_count, - int intermediate_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_gelu(__half* input, - const __half* bias, - int total_count, - int intermediate_size) 
-{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_gelu(T* input, - const T* bias, - int intermediate_size, - int batch_size, - cudaStream_t stream) -{ - int total_count = batch_size * (intermediate_size / 4); - int threads = 1024; // intermediate_size / iterations / 4; - dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / 1024 + 1)); // (batch_size); - - fused_bias_gelu<<>>( - input, bias, total_count, intermediate_size / 4); -} - -template void launch_bias_gelu(float*, const float*, int, int, cudaStream_t); -template void launch_bias_gelu<__half>(__half*, const __half*, int, int, cudaStream_t); - -__global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = 
bias_cast[offset % hidden_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % hidden_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream) -{ - int total_count = batch_size * (hidden_size / 4); - int threads = 1024; // hidden_size / iterations / 4; - dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / threads + 1)); // (batch_size); - - fused_bias_add<<>>(input, bias, total_count, hidden_size / 4); -} - -template void launch_bias_add(float*, const float*, int, int, cudaStream_t); -template void launch_bias_add<__half>(__half*, const __half*, int, int, cudaStream_t); - -__global__ void fused_bias_residual(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - int mp_size) -{ - float4* input_cast = reinterpret_cast(input); - float4* output_cast 
= reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x); - data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y); - data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z); - data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; - } -} - -__global__ void fused_bias_residual(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - int mp_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); - - float2 low_data 
= __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - (low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - (low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - (high_data.x + high_res.x) * mp_size + (high_out.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - (high_data.y + high_res.y) * mp_size + (high_out.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_residual(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int hidden_dim, - int mp_size, - cudaStream_t stream) -{ - int total_count = batch * hidden_dim / 4; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - - fused_bias_residual<<>>( - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); -} - -template void -launch_bias_residual(float*, float*, float*, float*, float*, int, int, int, cudaStream_t); -template void launch_bias_residual<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - cudaStream_t); - -__global__ void gptj_residual_add(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - float mp_size) -{ - float4* input_cast = reinterpret_cast(input); - float4* output_cast = 
reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = data.x * mp_size + (out.x + res_vec.x + bias_data.x + attn_bias.x); - data.y = data.y * mp_size + (out.y + res_vec.y + bias_data.y + attn_bias.y); - data.z = data.z * mp_size + (out.z + res_vec.z + bias_data.z + attn_bias.z); - data.w = data.w * mp_size + (out.w + res_vec.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; - } -} - -__global__ void gptj_residual_add(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - float mp_size) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); - - 
float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - low_data.x * mp_size + (low_out.x + low_res.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - low_data.y * mp_size + (low_out.y + low_res.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - high_data.x * mp_size + (high_out.x + high_res.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - high_data.y * mp_size + (high_out.y + high_res.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_gptj_residual_add(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int hidden_dim, - int batch, - int mp_size, - cudaStream_t stream) -{ - int total_count = batch * hidden_dim / 4; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - - gptj_residual_add<<>>( - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); -} - -template void launch_gptj_residual_add(float*, - float*, - float*, - float*, - float*, - int, - int, - int, - cudaStream_t); -template void launch_gptj_residual_add<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - cudaStream_t); - -__global__ void moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - int hidden_dim) -{ - unsigned tid = threadIdx.x; - float4* residual_cast = reinterpret_cast(residual); - float4* coef_cast = 
reinterpret_cast(coef); - float4* mlp_out_cast = reinterpret_cast(mlp_out); - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - float4* coef_cast2 = coef_cast + hidden_dim; - - while (tid < hidden_dim) { - float4 res = residual_cast[tid]; - float4 mlp = mlp_out_cast[tid]; - float4 coef1 = coef_cast[tid]; - float4 coef2 = coef_cast2[tid]; - mlp.x = mlp.x * coef2.x + res.x * coef1.x; - mlp.y = mlp.y * coef2.y + res.y * coef1.y; - mlp.z = mlp.z * coef2.z + res.z * coef1.z; - mlp.w = mlp.w * coef2.w + res.w * coef1.w; - mlp_out_cast[tid] = mlp; - tid += blockDim.x; - } -} - -__global__ void moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim) -{ - unsigned tid = threadIdx.x; - - float2* residual_cast = reinterpret_cast(residual); - float2* mlp_out_cast = reinterpret_cast(mlp_out); - float2* coef_cast = reinterpret_cast(coef); - float2* coef_cast2 = coef_cast + hidden_dim; - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - while (tid < hidden_dim) { - float2 res = residual_cast[tid]; - float2 coef1 = coef_cast[tid]; - float2 coef2 = coef_cast[tid]; - float2 data = mlp_out_cast[tid]; - __half* data_h = reinterpret_cast<__half*>(&data); - __half* coef1_h = reinterpret_cast<__half*>(&coef1); - __half* coef2_h = reinterpret_cast<__half*>(&coef2); - __half* res_h = reinterpret_cast<__half*>(&res); - data_h[0] = res_h[0] * coef1_h[0] + data_h[0] * coef2_h[0]; - data_h[1] = res_h[1] * coef1_h[1] + data_h[1] * coef2_h[1]; - data_h[2] = res_h[2] * coef1_h[2] + data_h[2] * coef2_h[2]; - data_h[3] = res_h[3] * coef1_h[3] + data_h[3] * coef2_h[3]; - - mlp_out_cast[tid] = data; - tid += blockDim.x; - } -} - -template -void launch_moe_res_matmul(T* residual, - T* coef, - T* mlp_out, - int seq_len, - int hidden_dim, - cudaStream_t stream) -{ - dim3 grid_dim(seq_len); - dim3 block_dim(1024); - moe_res_matmul<<>>( - residual, coef, mlp_out, seq_len, 
hidden_dim / 4); -} - -template void launch_moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - int hidden_dim, - cudaStream_t stream); -template void launch_moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/gelu.hip b/deepspeed/ops/csrc/transformer/inference/csrc/gelu.hip deleted file mode 100644 index 00c03efb9b6b3d7b05f19549472b5b771f46e1f4..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/gelu.hip +++ /dev/null @@ -1,527 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -#define MAX_CAP 4 -#define MAX_SEQ 2048 - -inline __device__ float gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); -} - -__global__ void fused_bias_gelu(float* input, - const float* bias, - int total_count, - int intermediate_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_gelu(__half* input, - const __half* bias, - int total_count, - int intermediate_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if 
(offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_gelu(T* input, - const T* bias, - int intermediate_size, - int batch_size, - hipStream_t stream) -{ - int total_count = batch_size * (intermediate_size / 4); - int threads = 1024; // intermediate_size / iterations / 4; - dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / 1024 + 1)); // (batch_size); - - hipLaunchKernelGGL(( fused_bias_gelu), dim3(grid_dims), dim3(block_dims), 0, stream, - input, bias, total_count, intermediate_size / 4); -} - -template void launch_bias_gelu(float*, const float*, int, int, hipStream_t); -template void launch_bias_gelu<__half>(__half*, const __half*, int, int, hipStream_t); - -__global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % hidden_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - 
input_cast[offset] = data; - } -} - -__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % hidden_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream) -{ - int total_count = batch_size * (hidden_size / 4); - int threads = 1024; // hidden_size / iterations / 4; - dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / threads + 1)); // (batch_size); - - hipLaunchKernelGGL(( fused_bias_add), dim3(grid_dims), dim3(block_dims), 0, stream, input, bias, total_count, hidden_size / 4); -} - -template void launch_bias_add(float*, const float*, int, int, hipStream_t); -template void launch_bias_add<__half>(__half*, const __half*, int, int, hipStream_t); - -__global__ void fused_bias_residual(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - int mp_size) -{ - float4* input_cast = reinterpret_cast(input); - float4* output_cast = reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - 
float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x); - data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y); - data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z); - data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; - } -} - -__global__ void fused_bias_residual(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - int mp_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = 
__half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - (low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - (low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - (high_data.x + high_res.x) * mp_size + (high_out.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - (high_data.y + high_res.y) * mp_size + (high_out.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_residual(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int hidden_dim, - int mp_size, - hipStream_t stream) -{ - int total_count = batch * hidden_dim / 4; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - - hipLaunchKernelGGL(( fused_bias_residual), dim3(grid_dims), dim3(block_dims), 0, stream, - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); -} - -template void -launch_bias_residual(float*, float*, float*, float*, float*, int, int, int, hipStream_t); -template void launch_bias_residual<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - hipStream_t); - -__global__ void gptj_residual_add(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - float mp_size) -{ - float4* input_cast = reinterpret_cast(input); - float4* output_cast = 
reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = data.x * mp_size + (out.x + res_vec.x + bias_data.x + attn_bias.x); - data.y = data.y * mp_size + (out.y + res_vec.y + bias_data.y + attn_bias.y); - data.z = data.z * mp_size + (out.z + res_vec.z + bias_data.z + attn_bias.z); - data.w = data.w * mp_size + (out.w + res_vec.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; - } -} - -__global__ void gptj_residual_add(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - float mp_size) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); - - 
float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - low_data.x * mp_size + (low_out.x + low_res.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - low_data.y * mp_size + (low_out.y + low_res.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - high_data.x * mp_size + (high_out.x + high_res.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - high_data.y * mp_size + (high_out.y + high_res.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_gptj_residual_add(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int hidden_dim, - int batch, - int mp_size, - hipStream_t stream) -{ - int total_count = batch * hidden_dim / 4; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - - hipLaunchKernelGGL(( gptj_residual_add), dim3(grid_dims), dim3(block_dims), 0, stream, - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); -} - -template void launch_gptj_residual_add(float*, - float*, - float*, - float*, - float*, - int, - int, - int, - hipStream_t); -template void launch_gptj_residual_add<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - hipStream_t); - -__global__ void moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - int hidden_dim) -{ - unsigned tid = threadIdx.x; - float4* residual_cast 
= reinterpret_cast(residual); - float4* coef_cast = reinterpret_cast(coef); - float4* mlp_out_cast = reinterpret_cast(mlp_out); - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - float4* coef_cast2 = coef_cast + hidden_dim; - - while (tid < hidden_dim) { - float4 res = residual_cast[tid]; - float4 mlp = mlp_out_cast[tid]; - float4 coef1 = coef_cast[tid]; - float4 coef2 = coef_cast2[tid]; - mlp.x = mlp.x * coef2.x + res.x * coef1.x; - mlp.y = mlp.y * coef2.y + res.y * coef1.y; - mlp.z = mlp.z * coef2.z + res.z * coef1.z; - mlp.w = mlp.w * coef2.w + res.w * coef1.w; - mlp_out_cast[tid] = mlp; - tid += blockDim.x; - } -} - -__global__ void moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim) -{ - unsigned tid = threadIdx.x; - - float2* residual_cast = reinterpret_cast(residual); - float2* mlp_out_cast = reinterpret_cast(mlp_out); - float2* coef_cast = reinterpret_cast(coef); - float2* coef_cast2 = coef_cast + hidden_dim; - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - while (tid < hidden_dim) { - float2 res = residual_cast[tid]; - float2 coef1 = coef_cast[tid]; - float2 coef2 = coef_cast[tid]; - float2 data = mlp_out_cast[tid]; - __half* data_h = reinterpret_cast<__half*>(&data); - __half* coef1_h = reinterpret_cast<__half*>(&coef1); - __half* coef2_h = reinterpret_cast<__half*>(&coef2); - __half* res_h = reinterpret_cast<__half*>(&res); - data_h[0] = res_h[0] * coef1_h[0] + data_h[0] * coef2_h[0]; - data_h[1] = res_h[1] * coef1_h[1] + data_h[1] * coef2_h[1]; - data_h[2] = res_h[2] * coef1_h[2] + data_h[2] * coef2_h[2]; - data_h[3] = res_h[3] * coef1_h[3] + data_h[3] * coef2_h[3]; - - mlp_out_cast[tid] = data; - tid += blockDim.x; - } -} - -template -void launch_moe_res_matmul(T* residual, - T* coef, - T* mlp_out, - int seq_len, - int hidden_dim, - hipStream_t stream) -{ - dim3 grid_dim(seq_len); - dim3 block_dim(1024); - 
hipLaunchKernelGGL(( moe_res_matmul), dim3(grid_dim), dim3(block_dim), 0, stream, - residual, coef, mlp_out, seq_len, hidden_dim / 4); -} - -template void launch_moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - int hidden_dim, - hipStream_t stream); -template void launch_moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim, - hipStream_t stream); diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/normalize.cu b/deepspeed/ops/csrc/transformer/inference/csrc/normalize.cu deleted file mode 100644 index 7f3cfc118631145cd30766cdf13d439a23c138c6..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/normalize.cu +++ /dev/null @@ -1,453 +0,0 @@ -#include -#include "custom_cuda_layers.h" - -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include -#include -#include - -#define NORM_REG (MAX_REGISTERS) - -namespace cg = cooperative_groups; - -__global__ void fused_bias_residual_layer_norm(float* output, - const float* vals, - const float* gamma, - const float* beta, - float epsilon, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - float sum = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = 
g.shfl(sum, 0); - - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id]; - output[out_id + row * row_stride] = inp_reg[f]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* output, - const __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - const __half2* vals_cast = reinterpret_cast(vals); - __half2* out_cast = reinterpret_cast<__half2*>(output); - - int k = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k++] = vals_cast[input_id + row * row_stride]; - input_id += iteration_stride; - } - float sum = 0; - for (int f = k - 1; f >= 0; f--) { - float2 inp_f = __half22float2(inp_reg[f]); - sum += inp_f.x + inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = 
g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = __half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; - inp_reg[f] = __float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - out_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream); - -template <> -void launch_layer_norm(float* out, - float* vals, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - out, vals, gamma, beta, epsilon, hidden_dim); -} - -template <> -void launch_layer_norm<__half>(__half* out, - __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - out, vals, gamma, beta, epsilon, hidden_dim / 2); -} - -__global__ void 
fused_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int row_stride, - bool preLN, - bool mlp_after_attn) -{ - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - float res_f = (residual[input_id + row * row_stride]); - float bias_f = (bias[input_id]); - if (mlp_after_attn) inp_reg[k] += res_f + bias_f; - // if (preLN) res_add[input_id + row * row_stride] = inp_reg[k]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id]; - norm[out_id + row * row_stride] = inp_reg[f]; - } -} - -__global__ void 
fused_residual_layer_norm(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride, - bool preLN, - bool mlp_after_attn) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - __half2* norm_cast = reinterpret_cast<__half2*>(norm); - __half2* res_add_cast = reinterpret_cast<__half2*>(res_add); - __half2* residual_cast = reinterpret_cast<__half2*>(residual); - const __half2* bias_cast = reinterpret_cast(bias); - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals_cast[input_id + row * row_stride]; - float2 inp_f = __half22float2(inp_reg[k]); - float2 res_f = __half22float2(residual_cast[input_id + row * row_stride]); - float2 bias_f = __half22float2(bias_cast[input_id]); - if (mlp_after_attn) { - inp_f.x += res_f.x + bias_f.x; - inp_f.y += res_f.y + bias_f.y; - } - inp_reg[k] = __float22half2_rn(inp_f); - // if (preLN) res_add_cast[input_id + row * row_stride] = __float22half2_rn(res_f); - // //inp_reg[k]; - sum += inp_f.x + inp_f.y; - input_id += iteration_stride; - k++; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = __half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; - inp_reg[f] = 
__float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - norm_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream); - -template <> -void launch_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - fused_residual_layer_norm<<>>(norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim, - preLN, - mlp_after_attn); -} - -template <> -void launch_residual_layer_norm<__half>(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 
grid_dim(batch_size); - dim3 block_dim(threads); - - fused_residual_layer_norm<<>>(norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim / 2, - preLN, - mlp_after_attn); -} diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/normalize.hip b/deepspeed/ops/csrc/transformer/inference/csrc/normalize.hip deleted file mode 100644 index 333e91f7c046a2e7ca3e2843f045cede327cae49..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/normalize.hip +++ /dev/null @@ -1,455 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include -#include "custom_hip_layers.h" - -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include -#include -#include - -#define NORM_REG (MAX_REGISTERS) - -namespace cg = cooperative_groups; - -__global__ void fused_bias_residual_layer_norm(float* output, - const float* vals, - const float* gamma, - const float* beta, - float epsilon, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - float sum = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * 
inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id]; - output[out_id + row * row_stride] = inp_reg[f]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* output, - const __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - const __half2* vals_cast = reinterpret_cast(vals); - __half2* out_cast = reinterpret_cast<__half2*>(output); - - int k = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k++] = vals_cast[input_id + row * row_stride]; - input_id += iteration_stride; - } - float sum = 0; - for (int f = k - 1; f >= 0; f--) { - float2 inp_f = __half22float2(inp_reg[f]); - sum += inp_f.x + inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = 
__half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; - inp_reg[f] = __float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - out_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream); - -template <> -void launch_layer_norm(float* out, - float* vals, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - out, vals, gamma, beta, epsilon, hidden_dim); -} - -template <> -void launch_layer_norm<__half>(__half* out, - __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - out, vals, gamma, beta, epsilon, hidden_dim / 2); -} - -__global__ void 
fused_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int row_stride, - bool preLN, - bool mlp_after_attn) -{ - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - float res_f = (residual[input_id + row * row_stride]); - float bias_f = (bias[input_id]); - if (mlp_after_attn) inp_reg[k] += res_f + bias_f; - // if (preLN) res_add[input_id + row * row_stride] = inp_reg[k]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id]; - norm[out_id + row * row_stride] = inp_reg[f]; - } -} - -__global__ void 
fused_residual_layer_norm(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride, - bool preLN, - bool mlp_after_attn) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - __half2* norm_cast = reinterpret_cast<__half2*>(norm); - __half2* res_add_cast = reinterpret_cast<__half2*>(res_add); - __half2* residual_cast = reinterpret_cast<__half2*>(residual); - const __half2* bias_cast = reinterpret_cast(bias); - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals_cast[input_id + row * row_stride]; - float2 inp_f = __half22float2(inp_reg[k]); - float2 res_f = __half22float2(residual_cast[input_id + row * row_stride]); - float2 bias_f = __half22float2(bias_cast[input_id]); - if (mlp_after_attn) { - inp_f.x += res_f.x + bias_f.x; - inp_f.y += res_f.y + bias_f.y; - } - inp_reg[k] = __float22half2_rn(inp_f); - // if (preLN) res_add_cast[input_id + row * row_stride] = __float22half2_rn(res_f); - // //inp_reg[k]; - sum += inp_f.x + inp_f.y; - input_id += iteration_stride; - k++; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = __half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; - inp_reg[f] = 
__float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - norm_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - hipStream_t stream); - -template <> -void launch_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - hipStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim, - preLN, - mlp_after_attn); -} - -template <> -void launch_residual_layer_norm<__half>(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - 
hipStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim / 2, - preLN, - mlp_after_attn); -} diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/pt_binding.cpp b/deepspeed/ops/csrc/transformer/inference/csrc/pt_binding.cpp deleted file mode 100644 index 320e6491b1cd1cb87749e8c8cb8624871b1cc904..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/pt_binding.cpp +++ /dev/null @@ -1,951 +0,0 @@ - -#include -#include -#include -#include "context.h" -#include "cublas_wrappers.h" -#include "custom_cuda_layers.h" - -std::array gemm_algos = std::array({99, 99, 99}); - -#define MAX_OUT_TOKES 10 - -template -at::Tensor ds_softmax(at::Tensor& attn_scores, - at::Tensor& attn_mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - bool async_op) -{ - auto attn_scores_c = attn_scores.contiguous(); - int bsz = attn_scores_c.size(0); - - int seq_len = attn_scores_c.size(1); - int len = attn_scores_c.sizes().size(); - if (len > 3) seq_len = attn_scores_c.size(2); - - int soft_len = attn_scores_c.size(2); - if (len > 3) soft_len = attn_scores_c.size(3); - - int heads = 1; - if (len > 3) heads = attn_scores_c.size(1); - - launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), - (attn_mask.sizes().size() > 1 ? 
(T*)attn_mask.data_ptr() : nullptr), - triangular, - recompute, - local_attention, - window_size, - bsz, - heads, - seq_len, - soft_len, - 1.0, - Context::Instance().GetCurrentStream(async_op)); - - return attn_scores_c; -} - -template -void allocate_workspace(size_t hidden_dim, - size_t max_seq_len, - size_t batch_size, - size_t head_size = 128) -{ - size_t _workSpaceSize = (hidden_dim * batch_size * max_seq_len); - Context::Instance().GenWorkSpace(_workSpaceSize * sizeof(T)); -} - -template -at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W) -{ - auto options = at::TensorOptions() - .dtype(Q.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - T* workspace = (T*)Context::Instance().GetWorkSpace(); - float alpha = 1; - float gemm_beta = 0.0; - - if (!workspace) { - allocate_workspace(W.size(1), MAX_OUT_TOKES, Q.size(0)); - workspace = (T*)Context::Instance().GetWorkSpace(); - } - - auto O = at::from_blob(workspace, {Q.size(1), Q.size(2), W.size(1)}, options); - unsigned m = W.size(1); - unsigned n = Q.size(1) * Q.size(2); - unsigned k = Q.size(0); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_T, - m, - n, - k, - &alpha, - &gemm_beta, - (T*)W.data_ptr(), - (T*)Q.data_ptr(), - (T*)O.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - return O; -} - -template -void attention_unfused(at::Tensor& prev_key_cont, - at::Tensor& query_cont, - at::Tensor& attn_mask, - at::Tensor& prev_value_cont, - at::Tensor& output, - int& bsz, - int& seq_len, - int& soft_len, - int& heads, - float& norm_factor, - bool triangular, - bool recompute, - bool local_attention, - int window_size) -{ - auto options = at::TensorOptions() - .dtype(query_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - float alpha = norm_factor; - float gemm_beta = 0.0; - auto attn_score = at::empty({bsz, heads, 
seq_len, soft_len}, options); - int k = prev_value_cont.size(2) / heads; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), - soft_len, - seq_len, - k, - &alpha, - &gemm_beta, - (T*)prev_key_cont.data_ptr(), - (T*)query_cont.data_ptr(), - (T*)attn_score.data_ptr(), - CUBLAS_OP_N, - CUBLAS_OP_N, - soft_len * k, - seq_len * k, - seq_len * soft_len, - bsz * heads, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - attn_score = ds_softmax( - attn_score, attn_mask, triangular, recompute, local_attention, window_size, false); - alpha = 1.0; - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), - k, - seq_len, - soft_len, - &alpha, - &gemm_beta, - (T*)prev_value_cont.data_ptr(), - (T*)attn_score.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_OP_N, - CUBLAS_OP_N, - soft_len * k, - seq_len * soft_len, - seq_len * k, - bsz * heads, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif -} - -template -std::vector ds_softmax_context(at::Tensor& query, - at::Tensor& prev_key, - at::Tensor& new_key, - at::Tensor& attn_mask, - at::Tensor& prev_value, - at::Tensor& new_value, - int heads, - float norm_factor, - bool merging, - bool triangular, - bool local_attention, - int window_size, - bool no_masking) -{ - auto query_cont = query.contiguous(); - auto prev_key_cont = prev_key.contiguous(); - auto prev_value_cont = prev_value.contiguous(); - - int new_size = (new_value.sizes().size() > 1 ? 
new_value.size(1) : 0); - - // Attn_Score [ batch Head Sequence-length Softmax-length] - - int bsz = query_cont.size(0); - int seq_len = query_cont.size(1); - int soft_len = prev_value.size(1); - - auto options = at::TensorOptions() - .dtype(query_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = - at::empty({prev_value.size(0), heads, seq_len, prev_value.size(2) / heads}, options); - attention_unfused(prev_key_cont, - query_cont, - attn_mask, //(no_masking ? nullptr : (T*)attn_mask.data_ptr()), - prev_value_cont, - output, - bsz, - seq_len, - soft_len, - heads, - norm_factor, - (triangular && (new_size == 0)), - (new_size == 0), - local_attention, - window_size); - - return {output, prev_key, prev_value}; -} - -template -at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - - int bsz = input_cont.size(0) * input_cont.size(1); - int intermediate_size = input_cont.size(2); - - launch_bias_gelu((T*)input_cont.data_ptr(), - (T*)bias.data_ptr(), - intermediate_size, - bsz, - Context::Instance().GetCurrentStream()); - return input_cont; -} - -template -at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - auto residual_cont = residual.contiguous(); - - int bsz = input_cont.size(0) * input_cont.size(1); - // launch_bias_residual((T*)input_cont.data_ptr(), - // (T*)residual_cont.data_ptr(), - // (T*)bias.data_ptr(), - // bsz, - // input_cont.size(2), - // (bias.size(0) > 1), - // Context::Instance().GetCurrentStream()); - return input_cont; -} - -template -at::Tensor ds_layernorm(at::Tensor& input_cont, at::Tensor& gamma, at::Tensor& betta, float epsilon) -{ - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - launch_layer_norm((T*)inp_norm.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)gamma.data_ptr(), - (T*)betta.data_ptr(), - 
epsilon, - bsz, - input_cont.size(2), - Context::Instance().GetCurrentStream()); - return inp_norm; -} - -template -at::Tensor qkv_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) -{ - auto inp_norm = ds_layernorm(input, gamma, beta, epsilon); - - // cudaEventRecord(Context::Instance().GetCompEvent(1), Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - int bsz = input.size(0) * input.size(1); - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - if (add_bias) - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - return inp_norm; -} - -template -std::vector ds_qkv_gemm(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = - qkv_unfused_cublas(output, input_cont, weight, bias, gamma, beta, epsilon, add_bias); - - return {output, inp_norm}; -} - -template -void quantized_gemm(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& qscale, - int groups, - int merge_count) -{ - int bsz 
= input.size(0) * input.size(1); - auto options = at::TensorOptions() - .dtype(input.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - auto weight16 = at::empty({weight.size(0), weight.size(1)}, options); - - launch_dequantize((T*)weight16.data_ptr(), - (int8_t*)weight.data_ptr(), - (float*)qscale.data_ptr(), - weight.size(1), - weight.size(0), - groups, - merge_count, - Context::Instance().GetCurrentStream()); - - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight16.data_ptr(), - (T*)input.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif -} - -template -at::Tensor ds_qkv_gemm_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool add_bias) -{ - int bsz = input.size(0) * input.size(1); - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - auto inp_norm = ds_layernorm(input_cont, gamma, beta, epsilon); - - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); - if (add_bias) - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -template -at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - 
.dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -template -at::Tensor ds_linear_layer_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& q_scale, - int groups) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - int bsz = input_cont.size(0) * input_cont.size(1); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - quantized_gemm(output, input_cont, weight, q_scale, groups, 0); - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - return output; -} - -template -at::Tensor ds_vector_matmul(at::Tensor& input, at::Tensor& weight, bool async_op) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int 
bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream(async_op)); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - return output; -} - -template -at::Tensor ds_vector_matmul_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& q_scale, - int groups, - int merge_count) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - quantized_gemm(output, input_cont, weight, q_scale, groups, merge_count); - return output; -} - -template -void mlp_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) -{ - int bsz = input.size(0) * input.size(1); - auto inp_norm = at::empty_like(input); - - launch_residual_layer_norm((T*)inp_norm.data_ptr(), - (T*)nullptr, - (T*)input.data_ptr(), - (T*)residual.data_ptr(), - (T*)input_bias.data_ptr(), - (T*)gamma.data_ptr(), - (T*)beta.data_ptr(), - epsilon, - bsz, - input.size(2), - preLayerNorm, - mlp_after_attn, - Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - 
CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); -} -template -at::Tensor ds_mlp_gemm(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - mlp_unfused_cublas(output, - mlp_after_attn ? input : residual, - residual, - input_bias, - weight, - bias, - gamma, - beta, - epsilon, - preLayerNorm, - mlp_after_attn); - - return output; -} - -template -std::vector ds_mlp_gemm_int8(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool preLayerNorm) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - - auto residual_add = (preLayerNorm ? 
at::empty_like(input_cont) : inp_norm); - // computing the blocking across K dimension - // launch_residual_layer_norm((T*)inp_norm.data_ptr(), - // (T*)residual_add.data_ptr(), - // (T*)input_cont.data_ptr(), - // (T*)residual.data_ptr(), - // (T*)input_bias.data_ptr(), - // (T*)gamma.data_ptr(), - // (T*)beta.data_ptr(), - // epsilon, - // bsz, - // input_cont.size(2), - // preLayerNorm, - // Context::Instance().GetCurrentStream()); - - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return {output, residual_add}; -} - -template -at::Tensor fused_gemm_gelu(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& weight_out, - const float epsilon, - bool preLayerNorm, - bool async_op) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto intermediate = - at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight_out.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)intermediate.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - launch_bias_gelu((T*)intermediate.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - 
cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight_out.size(1), - bsz, - intermediate.size(2), - &alpha, - &gemm_beta, - (T*)weight_out.data_ptr(), - (T*)intermediate.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - // cudaEventRecord(Context::Instance().GetCompEvent(2), - // Context::Instance().GetCurrentStream(true)); - return output; -} - -void residual_add_bias(at::Tensor& output, - at::Tensor& input, - at::Tensor& attention_output, - at::Tensor& output_b, - at::Tensor& attention_b, - int mp_size, - bool mlp_after_attn) -{ - int bsz = input.size(0) * input.size(1); - int hidden_size = input.size(2); - // cudaStreamWaitEvent( - // Context::Instance().GetCurrentStream(), Context::Instance().GetCompEvent(2), 0); - if (input.scalar_type() == at::kFloat) - if (mlp_after_attn) - launch_bias_residual((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - launch_gptj_residual_add((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); - else if (mlp_after_attn) - launch_bias_residual((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - launch_gptj_residual_add<__half>((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - 
Context::Instance().GetCurrentStream()); -} - -std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, - at::Tensor& key_layer, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - bool rotate_half, - bool rotate_every_two) -{ - auto query_cont = mixed_query.contiguous(); - auto key_cont = key_layer.contiguous(); - - unsigned bsz = mixed_query.size(0); - unsigned head_size = mixed_query.size(2) / num_heads; - unsigned seq_len = mixed_query.size(1); - - if (mixed_query.scalar_type() == at::kFloat) - launch_apply_rotary_pos_emb((float*)query_cont.data_ptr(), - (float*)key_cont.data_ptr(), - head_size, - seq_len, - rotary_dim, - offset, - num_heads, - bsz, - rotate_half, - rotate_every_two, - Context::Instance().GetCurrentStream()); - else - launch_apply_rotary_pos_emb<__half>((__half*)query_cont.data_ptr(), - (__half*)key_cont.data_ptr(), - head_size, - seq_len, - rotary_dim, - offset, - num_heads, - bsz, - rotate_half, - rotate_every_two, - Context::Instance().GetCurrentStream()); - return {query_cont, key_cont}; -} - -template -at::Tensor fused_gemm_gelu_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool preLayerNorm) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - int bsz = input_cont.size(0) * input_cont.size(1); - - quantized_gemm(output, input_cont, weight, q_scale, groups, 0); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -at::Tensor moe_res_matmul(at::Tensor& moe_res, at::Tensor& coef, at::Tensor& output) -{ - int M = moe_res.size(0) * moe_res.size(1); - int N = moe_res.size(2); - Context::Instance().SynchComm(); - if 
(moe_res.scalar_type() == at::kFloat) { - launch_moe_res_matmul((float*)moe_res.data_ptr(), - (float*)coef.data_ptr(), - (float*)output.data_ptr(), - M, - N, - at::cuda::getCurrentCUDAStream()); - } else { - launch_moe_res_matmul<__half>((__half*)moe_res.data_ptr(), - (__half*)coef.data_ptr(), - (__half*)output.data_ptr(), - M, - N, - at::cuda::getCurrentCUDAStream()); - } - return output; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("softmax_fp32", &ds_softmax, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def("softmax_fp16", &ds_softmax<__half>, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def( - "softmax_context_fp32", &ds_softmax_context, "DeepSpeed attention with fp32 (CUDA)"); - m.def("softmax_context_fp16", - &ds_softmax_context<__half>, - "DeepSpeed attention with fp32 (CUDA)"); - m.def("bias_gelu_fp32", &ds_bias_gelu, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_residual_fp32", - &ds_bias_residual, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("bias_residual_fp16", - &ds_bias_residual<__half>, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("layer_norm_fp32", &ds_layernorm, "DeepSpeed layer-norm with fp32 (CUDA)"); - m.def("layer_norm_fp16", &ds_layernorm<__half>, "DeepSpeed layer-norm with fp16 (CUDA)"); - m.def("qkv_gemm_fp32", &ds_qkv_gemm, "DeepSpeed qkv gemm with fp32 (CUDA)"); - m.def("qkv_gemm_fp16", &ds_qkv_gemm<__half>, "DeepSpeed qkv gemm with fp16 (CUDA)"); - m.def("qkv_gemm_int8", &ds_qkv_gemm_int8<__half>, "DeepSpeed qkv gemm with int8 (CUDA)"); - m.def("mlp_gemm_fp32", &ds_mlp_gemm, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("mlp_gemm_fp16", &ds_mlp_gemm<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("mlp_gemm_int8", &ds_mlp_gemm_int8<__half>, "DeepSpeed mlp with int8 (CUDA)"); - m.def("vector_matmul_fp32", &ds_vector_matmul, "DeepSpeed vector-MM with fp32 (CUDA)"); - m.def("vector_matmul_fp16", 
&ds_vector_matmul<__half>, "DeepSpeed vector-MM with fp16 (CUDA)"); - m.def("vector_matmul_int8", - &ds_vector_matmul_int8<__half>, - "DeepSpeed vector-MM with int8 (CUDA)"); - m.def("linear_layer_fp32", &ds_linear_layer, "DeepSpeed linear_layer with fp32 (CUDA)"); - m.def("linear_layer_fp16", &ds_linear_layer<__half>, "DeepSpeed linear_layer with fp16 (CUDA)"); - m.def("linear_layer_int8", - &ds_linear_layer_int8<__half>, - "DeepSpeed linear_layer with int8 (CUDA)"); - m.def("fused_gemm_gelu_fp32", &fused_gemm_gelu, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("fused_gemm_gelu_fp16", &fused_gemm_gelu<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("residual_add", &residual_add_bias, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("apply_rotary_pos_emb", &apply_rotary_pos_emb, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("einsum_sec_sm_ecm_fp32", - &einsum_sec_sm_ecm, - "DeepSpeed vector-MM with fp32 (CUDA)"); - - m.def("einsum_sec_sm_ecm_fp16", - &einsum_sec_sm_ecm<__half>, - "DeepSpeed vector-MM with fp16 (CUDA)"); - m.def("moe_res_matmul", &moe_res_matmul, "DeepSpeed moe residual matmul (CUDA)"); -} diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/pt_binding_hip.cpp b/deepspeed/ops/csrc/transformer/inference/csrc/pt_binding_hip.cpp deleted file mode 100644 index 6fed126f2c360dd3eec0ce9831b200acce3cd9d9..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/pt_binding_hip.cpp +++ /dev/null @@ -1,952 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
- -#include -#include -#include -#include "context_hip.h" -#include "cublas_wrappers_hip.h" -#include "custom_hip_layers.h" - -std::array gemm_algos = std::array({99, 99, 99}); - -#define MAX_OUT_TOKES 10 - -template -at::Tensor ds_softmax(at::Tensor& attn_scores, - at::Tensor& attn_mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - bool async_op) -{ - auto attn_scores_c = attn_scores.contiguous(); - int bsz = attn_scores_c.size(0); - - int seq_len = attn_scores_c.size(1); - int len = attn_scores_c.sizes().size(); - if (len > 3) seq_len = attn_scores_c.size(2); - - int soft_len = attn_scores_c.size(2); - if (len > 3) soft_len = attn_scores_c.size(3); - - int heads = 1; - if (len > 3) heads = attn_scores_c.size(1); - - launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), - (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), - triangular, - recompute, - local_attention, - window_size, - bsz, - heads, - seq_len, - soft_len, - 1.0, - Context::Instance().GetCurrentStream(async_op)); - - return attn_scores_c; -} - -template -void allocate_workspace(size_t hidden_dim, - size_t max_seq_len, - size_t batch_size, - size_t head_size = 128) -{ - size_t _workSpaceSize = (hidden_dim * batch_size * max_seq_len); - Context::Instance().GenWorkSpace(_workSpaceSize * sizeof(T)); -} - -template -at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W) -{ - auto options = at::TensorOptions() - .dtype(Q.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - T* workspace = (T*)Context::Instance().GetWorkSpace(); - float alpha = 1; - float gemm_beta = 0.0; - - if (!workspace) { - allocate_workspace(W.size(1), MAX_OUT_TOKES, Q.size(0)); - workspace = (T*)Context::Instance().GetWorkSpace(); - } - - auto O = at::from_blob(workspace, {Q.size(1), Q.size(2), W.size(1)}, options); - unsigned m = W.size(1); - unsigned n = Q.size(1) * Q.size(2); - unsigned k = Q.size(0); - 
cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_transpose, - m, - n, - k, - &alpha, - &gemm_beta, - (T*)W.data_ptr(), - (T*)Q.data_ptr(), - (T*)O.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - return O; -} - -template -void attention_unfused(at::Tensor& prev_key_cont, - at::Tensor& query_cont, - at::Tensor& attn_mask, - at::Tensor& prev_value_cont, - at::Tensor& output, - int& bsz, - int& seq_len, - int& soft_len, - int& heads, - float& norm_factor, - bool triangular, - bool recompute, - bool local_attention, - int window_size) -{ - auto options = at::TensorOptions() - .dtype(query_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - float alpha = norm_factor; - float gemm_beta = 0.0; - auto attn_score = at::empty({bsz, heads, seq_len, soft_len}, options); - int k = prev_value_cont.size(2) / heads; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), - soft_len, - seq_len, - k, - &alpha, - &gemm_beta, - (T*)prev_key_cont.data_ptr(), - (T*)query_cont.data_ptr(), - (T*)attn_score.data_ptr(), - rocblas_operation_none, - rocblas_operation_none, - soft_len * k, - seq_len * k, - seq_len * soft_len, - bsz * heads, -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - attn_score = ds_softmax( - attn_score, attn_mask, triangular, recompute, local_attention, window_size, false); - alpha = 1.0; - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), - k, - seq_len, - soft_len, - &alpha, - &gemm_beta, - (T*)prev_value_cont.data_ptr(), - (T*)attn_score.data_ptr(), - (T*)output.data_ptr(), - rocblas_operation_none, - rocblas_operation_none, - soft_len * k, - seq_len * soft_len, - seq_len * k, - bsz * heads, -#ifdef 
__HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif -} - -template -std::vector ds_softmax_context(at::Tensor& query, - at::Tensor& prev_key, - at::Tensor& new_key, - at::Tensor& attn_mask, - at::Tensor& prev_value, - at::Tensor& new_value, - int heads, - float norm_factor, - bool merging, - bool triangular, - bool local_attention, - int window_size, - bool no_masking) -{ - auto query_cont = query.contiguous(); - auto prev_key_cont = prev_key.contiguous(); - auto prev_value_cont = prev_value.contiguous(); - - int new_size = (new_value.sizes().size() > 1 ? new_value.size(1) : 0); - - // Attn_Score [ batch Head Sequence-length Softmax-length] - - int bsz = query_cont.size(0); - int seq_len = query_cont.size(1); - int soft_len = prev_value.size(1); - - auto options = at::TensorOptions() - .dtype(query_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = - at::empty({prev_value.size(0), heads, seq_len, prev_value.size(2) / heads}, options); - attention_unfused(prev_key_cont, - query_cont, - attn_mask, //(no_masking ? 
nullptr : (T*)attn_mask.data_ptr()), - prev_value_cont, - output, - bsz, - seq_len, - soft_len, - heads, - norm_factor, - (triangular && (new_size == 0)), - (new_size == 0), - local_attention, - window_size); - - return {output, prev_key, prev_value}; -} - -template -at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - - int bsz = input_cont.size(0) * input_cont.size(1); - int intermediate_size = input_cont.size(2); - - launch_bias_gelu((T*)input_cont.data_ptr(), - (T*)bias.data_ptr(), - intermediate_size, - bsz, - Context::Instance().GetCurrentStream()); - return input_cont; -} - -template -at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - auto residual_cont = residual.contiguous(); - - int bsz = input_cont.size(0) * input_cont.size(1); - // launch_bias_residual((T*)input_cont.data_ptr(), - // (T*)residual_cont.data_ptr(), - // (T*)bias.data_ptr(), - // bsz, - // input_cont.size(2), - // (bias.size(0) > 1), - // Context::Instance().GetCurrentStream()); - return input_cont; -} - -template -at::Tensor ds_layernorm(at::Tensor& input_cont, at::Tensor& gamma, at::Tensor& betta, float epsilon) -{ - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - launch_layer_norm((T*)inp_norm.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)gamma.data_ptr(), - (T*)betta.data_ptr(), - epsilon, - bsz, - input_cont.size(2), - Context::Instance().GetCurrentStream()); - return inp_norm; -} - -template -at::Tensor qkv_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) -{ - auto inp_norm = ds_layernorm(input, gamma, beta, epsilon); - - // hipEventRecord(Context::Instance().GetCompEvent(1), Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - 
int bsz = input.size(0) * input.size(1); - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - if (add_bias) - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - return inp_norm; -} - -template -std::vector ds_qkv_gemm(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = - qkv_unfused_cublas(output, input_cont, weight, bias, gamma, beta, epsilon, add_bias); - - return {output, inp_norm}; -} - -template -void quantized_gemm(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& qscale, - int groups, - int merge_count) -{ - int bsz = input.size(0) * input.size(1); - auto options = at::TensorOptions() - .dtype(input.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - auto weight16 = at::empty({weight.size(0), weight.size(1)}, options); - - launch_dequantize((T*)weight16.data_ptr(), - (int8_t*)weight.data_ptr(), - (float*)qscale.data_ptr(), - weight.size(1), - weight.size(0), - groups, - merge_count, - Context::Instance().GetCurrentStream()); - - 
rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight16.data_ptr(), - (T*)input.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif -} - -template -at::Tensor ds_qkv_gemm_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool add_bias) -{ - int bsz = input.size(0) * input.size(1); - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - auto inp_norm = ds_layernorm(input_cont, gamma, beta, epsilon); - - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); - if (add_bias) - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -template -at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - - 
cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -template -at::Tensor ds_linear_layer_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& q_scale, - int groups) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - int bsz = input_cont.size(0) * input_cont.size(1); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - quantized_gemm(output, input_cont, weight, q_scale, groups, 0); - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - return output; -} - -template -at::Tensor ds_vector_matmul(at::Tensor& input, at::Tensor& weight, bool async_op) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream(async_op)); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - 
(T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - return output; -} - -template -at::Tensor ds_vector_matmul_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& q_scale, - int groups, - int merge_count) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - quantized_gemm(output, input_cont, weight, q_scale, groups, merge_count); - return output; -} - -template -void mlp_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) -{ - int bsz = input.size(0) * input.size(1); - auto inp_norm = at::empty_like(input); - - launch_residual_layer_norm((T*)inp_norm.data_ptr(), - (T*)nullptr, - (T*)input.data_ptr(), - (T*)residual.data_ptr(), - (T*)input_bias.data_ptr(), - (T*)gamma.data_ptr(), - (T*)beta.data_ptr(), - epsilon, - bsz, - input.size(2), - preLayerNorm, - mlp_after_attn, - Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - 
weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); -} -template -at::Tensor ds_mlp_gemm(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - mlp_unfused_cublas(output, - mlp_after_attn ? input : residual, - residual, - input_bias, - weight, - bias, - gamma, - beta, - epsilon, - preLayerNorm, - mlp_after_attn); - - return output; -} - -template -std::vector ds_mlp_gemm_int8(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool preLayerNorm) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - - auto residual_add = (preLayerNorm ? 
at::empty_like(input_cont) : inp_norm); - // computing the blocking across K dimension - // launch_residual_layer_norm((T*)inp_norm.data_ptr(), - // (T*)residual_add.data_ptr(), - // (T*)input_cont.data_ptr(), - // (T*)residual.data_ptr(), - // (T*)input_bias.data_ptr(), - // (T*)gamma.data_ptr(), - // (T*)beta.data_ptr(), - // epsilon, - // bsz, - // input_cont.size(2), - // preLayerNorm, - // Context::Instance().GetCurrentStream()); - - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return {output, residual_add}; -} - -template -at::Tensor fused_gemm_gelu(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& weight_out, - const float epsilon, - bool preLayerNorm, - bool async_op) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto intermediate = - at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight_out.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)intermediate.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - launch_bias_gelu((T*)intermediate.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - 
cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight_out.size(1), - bsz, - intermediate.size(2), - &alpha, - &gemm_beta, - (T*)weight_out.data_ptr(), - (T*)intermediate.data_ptr(), - (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ - rocblas_gemm_algo_standard); -#else - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -#endif - // hipEventRecord(Context::Instance().GetCompEvent(2), - // Context::Instance().GetCurrentStream(true)); - return output; -} - -void residual_add_bias(at::Tensor& output, - at::Tensor& input, - at::Tensor& attention_output, - at::Tensor& output_b, - at::Tensor& attention_b, - int mp_size, - bool mlp_after_attn) -{ - int bsz = input.size(0) * input.size(1); - int hidden_size = input.size(2); - // hipStreamWaitEvent( - // Context::Instance().GetCurrentStream(), Context::Instance().GetCompEvent(2), 0); - if (input.scalar_type() == at::kFloat) - if (mlp_after_attn) - launch_bias_residual((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - launch_gptj_residual_add((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); - else if (mlp_after_attn) - launch_bias_residual((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - launch_gptj_residual_add<__half>((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - hidden_size, - bsz, - 
mp_size, - Context::Instance().GetCurrentStream()); -} - -std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, - at::Tensor& key_layer, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - bool rotate_half, - bool rotate_every_two) -{ - auto query_cont = mixed_query.contiguous(); - auto key_cont = key_layer.contiguous(); - - unsigned bsz = mixed_query.size(0); - unsigned head_size = mixed_query.size(2) / num_heads; - unsigned seq_len = mixed_query.size(1); - - if (mixed_query.scalar_type() == at::kFloat) - launch_apply_rotary_pos_emb((float*)query_cont.data_ptr(), - (float*)key_cont.data_ptr(), - head_size, - seq_len, - rotary_dim, - offset, - num_heads, - bsz, - rotate_half, - rotate_every_two, - Context::Instance().GetCurrentStream()); - else - launch_apply_rotary_pos_emb<__half>((__half*)query_cont.data_ptr(), - (__half*)key_cont.data_ptr(), - head_size, - seq_len, - rotary_dim, - offset, - num_heads, - bsz, - rotate_half, - rotate_every_two, - Context::Instance().GetCurrentStream()); - return {query_cont, key_cont}; -} - -template -at::Tensor fused_gemm_gelu_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool preLayerNorm) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - int bsz = input_cont.size(0) * input_cont.size(1); - - quantized_gemm(output, input_cont, weight, q_scale, groups, 0); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -at::Tensor moe_res_matmul(at::Tensor& moe_res, at::Tensor& coef, at::Tensor& output) -{ - int M = moe_res.size(0) * moe_res.size(1); - int N = moe_res.size(2); - 
Context::Instance().SynchComm(); - if (moe_res.scalar_type() == at::kFloat) { - launch_moe_res_matmul((float*)moe_res.data_ptr(), - (float*)coef.data_ptr(), - (float*)output.data_ptr(), - M, - N, - at::hip::getCurrentHIPStreamMasqueradingAsCUDA()); - } else { - launch_moe_res_matmul<__half>((__half*)moe_res.data_ptr(), - (__half*)coef.data_ptr(), - (__half*)output.data_ptr(), - M, - N, - at::hip::getCurrentHIPStreamMasqueradingAsCUDA()); - } - return output; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("softmax_fp32", &ds_softmax, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def("softmax_fp16", &ds_softmax<__half>, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def( - "softmax_context_fp32", &ds_softmax_context, "DeepSpeed attention with fp32 (CUDA)"); - m.def("softmax_context_fp16", - &ds_softmax_context<__half>, - "DeepSpeed attention with fp32 (CUDA)"); - m.def("bias_gelu_fp32", &ds_bias_gelu, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_residual_fp32", - &ds_bias_residual, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("bias_residual_fp16", - &ds_bias_residual<__half>, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("layer_norm_fp32", &ds_layernorm, "DeepSpeed layer-norm with fp32 (CUDA)"); - m.def("layer_norm_fp16", &ds_layernorm<__half>, "DeepSpeed layer-norm with fp16 (CUDA)"); - m.def("qkv_gemm_fp32", &ds_qkv_gemm, "DeepSpeed qkv gemm with fp32 (CUDA)"); - m.def("qkv_gemm_fp16", &ds_qkv_gemm<__half>, "DeepSpeed qkv gemm with fp16 (CUDA)"); - m.def("qkv_gemm_int8", &ds_qkv_gemm_int8<__half>, "DeepSpeed qkv gemm with int8 (CUDA)"); - m.def("mlp_gemm_fp32", &ds_mlp_gemm, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("mlp_gemm_fp16", &ds_mlp_gemm<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("mlp_gemm_int8", &ds_mlp_gemm_int8<__half>, "DeepSpeed mlp with int8 (CUDA)"); - m.def("vector_matmul_fp32", &ds_vector_matmul, "DeepSpeed vector-MM 
with fp32 (CUDA)"); - m.def("vector_matmul_fp16", &ds_vector_matmul<__half>, "DeepSpeed vector-MM with fp16 (CUDA)"); - m.def("vector_matmul_int8", - &ds_vector_matmul_int8<__half>, - "DeepSpeed vector-MM with int8 (CUDA)"); - m.def("linear_layer_fp32", &ds_linear_layer, "DeepSpeed linear_layer with fp32 (CUDA)"); - m.def("linear_layer_fp16", &ds_linear_layer<__half>, "DeepSpeed linear_layer with fp16 (CUDA)"); - m.def("linear_layer_int8", - &ds_linear_layer_int8<__half>, - "DeepSpeed linear_layer with int8 (CUDA)"); - m.def("fused_gemm_gelu_fp32", &fused_gemm_gelu, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("fused_gemm_gelu_fp16", &fused_gemm_gelu<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("residual_add", &residual_add_bias, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("apply_rotary_pos_emb", &apply_rotary_pos_emb, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("einsum_sec_sm_ecm_fp32", - &einsum_sec_sm_ecm, - "DeepSpeed vector-MM with fp32 (CUDA)"); - - m.def("einsum_sec_sm_ecm_fp16", - &einsum_sec_sm_ecm<__half>, - "DeepSpeed vector-MM with fp16 (CUDA)"); - m.def("moe_res_matmul", &moe_res_matmul, "DeepSpeed moe residual matmul (CUDA)"); -} diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/softmax.cu b/deepspeed/ops/csrc/transformer/inference/csrc/softmax.cu deleted file mode 100644 index bf3c8bc90049ddd9cf91ce4006729d02ebcdcf3e..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/softmax.cu +++ /dev/null @@ -1,434 +0,0 @@ -#include -#include "custom_cuda_layers.h" - -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include -#include -#include - -#define ATTN_THREADS 1024 -#define MAX_REG_SIZE 8 - -#define minus_infinity -10000.0 - -void CheckCudaErrorAux(const char* file, unsigned line) -{ - cudaError_t err = cudaGetLastError(); - if (err == cudaSuccess) return; - std::cerr << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line - << std::endl; - throw std::runtime_error("CUDA ERROR!!!\n"); 
-} - -#define CUDA_CHECK_ERROR() CheckCudaErrorAux(__FILE__, __LINE__) - -namespace cg = cooperative_groups; - -__global__ void attn_softmax_v2(__half* vals, - __half* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int total_count, - int heads, - int sequence_length, - int num_seq, - float scale, - int iterations, - int reduceWidth) -{ -#ifdef HALF_PRECISION_AVAILABLE - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - float2 low_data[MAX_REG_SIZE]; - float2 high_data[MAX_REG_SIZE]; - - __half2 h_scale = __float2half2_rn(scale); - - int wid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int reduce_blocks = reduceWidth >> 5; - int seq_lane = threadIdx.x % reduceWidth; - - __shared__ float partialSum[MAX_WARP_NUM]; - - int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); - - if (iter_offset < total_count) { - vals += (iter_offset * sequence_length); - - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); - int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; - - int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); - int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) - ? (real_seq_id >> 2) - (window_size >> 2) - : 0; - int window_stride = - (local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; - low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? 
__half2float(vals[data_id + 1]) - : minus_infinity; - high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? __half2float(vals[data_id + 2]) - : minus_infinity; - high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? __half2float(vals[data_id + 3]) - : minus_infinity; - if (mask && recompute) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); - } - } else { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; - low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) && - (data_id + 1) > window_stride) && - (data_id + 1) < sequence_length) - ? __half2float(vals[data_id + 1]) - : minus_infinity; - high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) && - (data_id + 2) > window_stride) && - (data_id + 2) < sequence_length) - ? __half2float(vals[data_id + 2]) - : minus_infinity; - high_data[i].y = minus_infinity; - if (mask && recompute) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - if ((data_id + 1) < sequence_length) - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - if ((data_id + 2) < sequence_length) - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - } - } - // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); - max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); - max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); - max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); - max_val = (high_data[i].y > max_val ? 
high_data[i].y : max_val); - } else { - low_data[i].x = minus_infinity; - low_data[i].y = minus_infinity; - high_data[i].x = minus_infinity; - high_data[i].y = minus_infinity; - } - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE); - } - float sum = 0; - for (int i = 0; i < iterations; i++) { - low_data[i].x = __expf(low_data[i].x - max_val); - low_data[i].y = __expf(low_data[i].y - max_val); - high_data[i].x = __expf(high_data[i].x - max_val); - high_data[i].y = __expf(high_data[i].y - max_val); - - sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i); - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / WARP_SIZE); - } - sum += 1e-6; - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - - if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = low_data[i].x / sum; - vals[data_id + 1] = low_data[i].y / sum; - vals[data_id + 2] = high_data[i].x / sum; - vals[data_id + 3] = high_data[i].y / sum; - } else { - vals[data_id] = low_data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = low_data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = high_data[i].x / sum; - } - } - } - } -#endif -} - -__global__ void 
attn_softmax_v2(float* vals, - float* attn_mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int total_count, - int heads, - int sequence_length, - int num_seq, - float scale, - int iterations, - int reduceWidth) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - float4 data[MAX_REG_SIZE]; - - int wid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int reduce_blocks = reduceWidth >> 5; - int seq_lane = threadIdx.x % reduceWidth; - - __shared__ float partialSum[MAX_WARP_NUM]; - - int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); - if (iter_offset < total_count) { - vals += (iter_offset * sequence_length); - - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); - int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; - - int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); - int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) - ? (real_seq_id >> 2) - (window_size >> 2) - : 0; - int window_stride = - (local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - data[i].x = (data_id > window_stride ? vals[data_id] : minus_infinity); - data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? vals[data_id + 1] - : minus_infinity; - data[i].z = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? vals[data_id + 2] - : minus_infinity; - data[i].w = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? 
vals[data_id + 3] - : minus_infinity; - if (attn_mask && recompute) { - data[i].x += attn_mask[data_id + mask_offset]; - data[i].y += attn_mask[data_id + mask_offset + 1]; - data[i].z += attn_mask[data_id + mask_offset + 2]; - data[i].w += attn_mask[data_id + mask_offset + 3]; - } - } else { - data[i].x = data_id > window_stride ? vals[data_id] : minus_infinity; - data[i].y = (((!triangular || (data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride && (data_id + 1) < sequence_length) - ? (vals[data_id + 1]) - : minus_infinity; - data[i].z = (((!triangular || (data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride && (data_id + 2) < sequence_length) - ? (vals[data_id + 2]) - : minus_infinity; - data[i].w = minus_infinity; - if (attn_mask && recompute) { - data[i].x += attn_mask[data_id + mask_offset]; - if ((data_id + 1) < sequence_length) - data[i].y += attn_mask[data_id + mask_offset + 1]; - if ((data_id + 2) < sequence_length) - data[i].z += attn_mask[data_id + mask_offset + 2]; - } - } - max_val = (data[i].x > max_val ? data[i].x : max_val); - max_val = (data[i].y > max_val ? data[i].y : max_val); - max_val = (data[i].z > max_val ? data[i].z : max_val); - max_val = (data[i].w > max_val ? data[i].w : max_val); - } else { - data[i].x = minus_infinity; - data[i].y = minus_infinity; - data[i].z = minus_infinity; - data[i].w = minus_infinity; - } - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - data[i].x = __expf(data[i].x - max_val); - data[i].y = __expf(data[i].y - max_val); - data[i].z = __expf(data[i].z - max_val); - data[i].w = __expf(data[i].w - max_val); - - sum += (data[i].x + data[i].y + data[i].z + data[i].w); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i); - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / WARP_SIZE); - } - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - - if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = data[i].x / sum; - vals[data_id + 1] = data[i].y / sum; - vals[data_id + 2] = data[i].z / sum; - vals[data_id + 3] = data[i].w / sum; - } else { - vals[data_id] = data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = data[i].z / sum; - } - } - } - } -} - -template -void launch_attn_softmax_v2(T* vals, - T* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - cudaStream_t stream) -{ - int total_count = batch_size * heads * num_seq; - dim3 grid_dim((total_count - 1) / (WARP_SIZE / ((sequence_length - 1) / ATTN_THREADS + 1)) + 1); - dim3 block_dim(ATTN_THREADS); - - const int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1) * WARP_SIZE; - const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1; - - if (sequence_length <= 32768) - attn_softmax_v2<<>>( - vals, - mask, - triangular, - recompute, - 
local_attention, - window_size, - total_count, - (triangular ? (heads * batch_size) : heads), - sequence_length, - num_seq, - scale, - iterations, - reduce_width); - else - throw std::runtime_error("Unsupport Seq_Length!"); -} - -template void launch_attn_softmax_v2(float* vals, - float* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - cudaStream_t stream); -template void launch_attn_softmax_v2(__half* vals, - __half* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer/inference/csrc/softmax.hip b/deepspeed/ops/csrc/transformer/inference/csrc/softmax.hip deleted file mode 100644 index 51d5bef3a72436a23f910b3a73ada214b012389a..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/csrc/softmax.hip +++ /dev/null @@ -1,436 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include -#include "custom_hip_layers.h" - -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include -#include -#include - -#define ATTN_THREADS 1024 -#define MAX_REG_SIZE 8 - -#define minus_infinity -10000.0 - -void CheckCudaErrorAux(const char* file, unsigned line) -{ - hipError_t err = hipGetLastError(); - if (err == hipSuccess) return; - std::cerr << hipGetErrorString(err) << "(" << err << ") at " << file << ":" << line - << std::endl; - throw std::runtime_error("CUDA ERROR!!!\n"); -} - -#define CUDA_CHECK_ERROR() CheckCudaErrorAux(__FILE__, __LINE__) - -namespace cg = cooperative_groups; - -__global__ void attn_softmax_v2(__half* vals, - __half* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int total_count, - int heads, - int sequence_length, - int num_seq, - float scale, - int iterations, - int reduceWidth) -{ -#ifdef HALF_PRECISION_AVAILABLE - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - float2 low_data[MAX_REG_SIZE]; - float2 high_data[MAX_REG_SIZE]; - - __half2 h_scale = __float2half2_rn(scale); - - int wid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int reduce_blocks = reduceWidth >> 5; - int seq_lane = threadIdx.x % reduceWidth; - - __shared__ float partialSum[MAX_WARP_NUM]; - - int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); - - if (iter_offset < total_count) { - vals += (iter_offset * sequence_length); - - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); - int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; - - int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); - int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) - ? (real_seq_id >> 2) - (window_size >> 2) - : 0; - int window_stride = - (local_attention && real_seq_id >= window_size) ? 
real_seq_id - window_size : -1; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; - low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? __half2float(vals[data_id + 1]) - : minus_infinity; - high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? __half2float(vals[data_id + 2]) - : minus_infinity; - high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? __half2float(vals[data_id + 3]) - : minus_infinity; - if (mask && recompute) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); - } - } else { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; - low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) && - (data_id + 1) > window_stride) && - (data_id + 1) < sequence_length) - ? __half2float(vals[data_id + 1]) - : minus_infinity; - high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) && - (data_id + 2) > window_stride) && - (data_id + 2) < sequence_length) - ? 
__half2float(vals[data_id + 2]) - : minus_infinity; - high_data[i].y = minus_infinity; - if (mask && recompute) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - if ((data_id + 1) < sequence_length) - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - if ((data_id + 2) < sequence_length) - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - } - } - // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); - max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); - max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); - max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); - max_val = (high_data[i].y > max_val ? high_data[i].y : max_val); - } else { - low_data[i].x = minus_infinity; - low_data[i].y = minus_infinity; - high_data[i].x = minus_infinity; - high_data[i].y = minus_infinity; - } - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE); - } - float sum = 0; - for (int i = 0; i < iterations; i++) { - low_data[i].x = __expf(low_data[i].x - max_val); - low_data[i].y = __expf(low_data[i].y - max_val); - high_data[i].x = __expf(high_data[i].x - max_val); - high_data[i].y = __expf(high_data[i].y - max_val); - - sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i); - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / WARP_SIZE); - } - sum += 1e-6; - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - - if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = low_data[i].x / sum; - vals[data_id + 1] = low_data[i].y / sum; - vals[data_id + 2] = high_data[i].x / sum; - vals[data_id + 3] = high_data[i].y / sum; - } else { - vals[data_id] = low_data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = low_data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = high_data[i].x / sum; - } - } - } - } -#endif -} - -__global__ void attn_softmax_v2(float* vals, - float* attn_mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int total_count, - int heads, - int sequence_length, - int num_seq, - float scale, - int iterations, - int reduceWidth) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - float4 data[MAX_REG_SIZE]; - - int wid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int reduce_blocks = reduceWidth >> 5; - int seq_lane = threadIdx.x % reduceWidth; - - __shared__ float 
partialSum[MAX_WARP_NUM]; - - int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); - if (iter_offset < total_count) { - vals += (iter_offset * sequence_length); - - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); - int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; - - int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); - int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) - ? (real_seq_id >> 2) - (window_size >> 2) - : 0; - int window_stride = - (local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - data[i].x = (data_id > window_stride ? vals[data_id] : minus_infinity); - data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? vals[data_id + 1] - : minus_infinity; - data[i].z = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? vals[data_id + 2] - : minus_infinity; - data[i].w = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? vals[data_id + 3] - : minus_infinity; - if (attn_mask && recompute) { - data[i].x += attn_mask[data_id + mask_offset]; - data[i].y += attn_mask[data_id + mask_offset + 1]; - data[i].z += attn_mask[data_id + mask_offset + 2]; - data[i].w += attn_mask[data_id + mask_offset + 3]; - } - } else { - data[i].x = data_id > window_stride ? vals[data_id] : minus_infinity; - data[i].y = (((!triangular || (data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride && (data_id + 1) < sequence_length) - ? 
(vals[data_id + 1]) - : minus_infinity; - data[i].z = (((!triangular || (data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride && (data_id + 2) < sequence_length) - ? (vals[data_id + 2]) - : minus_infinity; - data[i].w = minus_infinity; - if (attn_mask && recompute) { - data[i].x += attn_mask[data_id + mask_offset]; - if ((data_id + 1) < sequence_length) - data[i].y += attn_mask[data_id + mask_offset + 1]; - if ((data_id + 2) < sequence_length) - data[i].z += attn_mask[data_id + mask_offset + 2]; - } - } - max_val = (data[i].x > max_val ? data[i].x : max_val); - max_val = (data[i].y > max_val ? data[i].y : max_val); - max_val = (data[i].z > max_val ? data[i].z : max_val); - max_val = (data[i].w > max_val ? data[i].w : max_val); - } else { - data[i].x = minus_infinity; - data[i].y = minus_infinity; - data[i].z = minus_infinity; - data[i].w = minus_infinity; - } - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - data[i].x = __expf(data[i].x - max_val); - data[i].y = __expf(data[i].y - max_val); - data[i].z = __expf(data[i].z - max_val); - data[i].w = __expf(data[i].w - max_val); - - sum += (data[i].x + data[i].y + data[i].z + data[i].w); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i); - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / WARP_SIZE); - } - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - - if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = data[i].x / sum; - vals[data_id + 1] = data[i].y / sum; - vals[data_id + 2] = data[i].z / sum; - vals[data_id + 3] = data[i].w / sum; - } else { - vals[data_id] = data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = data[i].z / sum; - } - } - } - } -} - -template -void launch_attn_softmax_v2(T* vals, - T* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - hipStream_t stream) -{ - int total_count = batch_size * heads * num_seq; - dim3 grid_dim((total_count - 1) / (WARP_SIZE / ((sequence_length - 1) / ATTN_THREADS + 1)) + 1); - dim3 block_dim(ATTN_THREADS); - - const int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1) * WARP_SIZE; - const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1; - - if (sequence_length <= 32768) - hipLaunchKernelGGL(( attn_softmax_v2), dim3(grid_dim), dim3(block_dim), 0, stream, 
- vals, - mask, - triangular, - recompute, - local_attention, - window_size, - total_count, - (triangular ? (heads * batch_size) : heads), - sequence_length, - num_seq, - scale, - iterations, - reduce_width); - else - throw std::runtime_error("Unsupport Seq_Length!"); -} - -template void launch_attn_softmax_v2(float* vals, - float* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - hipStream_t stream); -template void launch_attn_softmax_v2(__half* vals, - __half* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - hipStream_t stream); diff --git a/deepspeed/ops/csrc/transformer/inference/includes/custom_cuda_layers.h b/deepspeed/ops/csrc/transformer/inference/includes/custom_cuda_layers.h deleted file mode 100644 index 06b4340061c98c65b4b301c7349d2da03185f715..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/includes/custom_cuda_layers.h +++ /dev/null @@ -1,124 +0,0 @@ -#pragma once - -#ifdef __HIP_PLATFORM_HCC__ -#define HALF_PRECISION_AVAILABLE = 1 -#include -#else -#if __CUDA_ARCH__ >= 700 -#define HALF_PRECISION_AVAILABLE = 1 -#endif -#include -#endif - -#include -#include -#include -#include -#include -#include - -#define MAX_WARP_NUM 32 -#define WARP_SIZE 32 -#define SMs 80 - -#define MAX_REGISTERS 256 -template -void launch_attn_softmax_v2(T* vals, - T* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - cudaStream_t stream); - -// Fused bias add with gelu activation -template -void launch_bias_gelu(T* input, - const T* bias, - int intermediate_size, - int batch_size, - cudaStream_t stream); -template -void launch_bias_add(T* input, const T* bias, int hidden_size, 
int batch_size, cudaStream_t stream); - -template -void launch_bias_residual(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int hidden_dim, - int mp_size, - cudaStream_t stream); - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream); - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream); -template -void launch_dequantize(T* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count, - cudaStream_t stream); - -template -void launch_gptj_residual_add(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int head_size, - int mp_size, - cudaStream_t stream); - -template -void launch_apply_rotary_pos_emb(T* mixed_query, - T* key_layer, - unsigned head_size, - unsigned seq_len, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - unsigned batch, - bool rotate_half, - bool rotate_every_two, - cudaStream_t stream); - -template -void launch_moe_res_matmul(T* residual, - T* coef, - T* mlp_out, - int seq_len, - int hidden_dim, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer/inference/includes/custom_hip_layers.h b/deepspeed/ops/csrc/transformer/inference/includes/custom_hip_layers.h deleted file mode 100644 index 36cab34d6262f5d6211a18584f6d55284c04846e..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/inference/includes/custom_hip_layers.h +++ /dev/null @@ -1,125 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#pragma once - -#ifdef __HIP_PLATFORM_HCC__ -#define HALF_PRECISION_AVAILABLE = 1 -#include -#else -#if __CUDA_ARCH__ >= 700 -#define HALF_PRECISION_AVAILABLE = 1 -#endif -#include -#endif - -#include -#include -#include -#include -#include -#include - -#define MAX_WARP_NUM 32 -#define WARP_SIZE 32 -#define SMs 80 - -#define MAX_REGISTERS 256 -template -void launch_attn_softmax_v2(T* vals, - T* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - hipStream_t stream); - -// Fused bias add with gelu activation -template -void launch_bias_gelu(T* input, - const T* bias, - int intermediate_size, - int batch_size, - hipStream_t stream); -template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream); - -template -void launch_bias_residual(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int hidden_dim, - int mp_size, - hipStream_t stream); - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream); - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - hipStream_t stream); -template -void launch_dequantize(T* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count, - hipStream_t stream); - -template -void launch_gptj_residual_add(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int head_size, - int mp_size, - hipStream_t stream); - -template -void launch_apply_rotary_pos_emb(T* mixed_query, - T* key_layer, - unsigned head_size, - unsigned seq_len, - unsigned rotary_dim, - unsigned 
offset, - unsigned num_heads, - unsigned batch, - bool rotate_half, - bool rotate_every_two, - hipStream_t stream); - -template -void launch_moe_res_matmul(T* residual, - T* coef, - T* mlp_out, - int seq_len, - int hidden_dim, - hipStream_t stream); diff --git a/deepspeed/ops/csrc/transformer/normalize_kernels.cu b/deepspeed/ops/csrc/transformer/normalize_kernels.cu deleted file mode 100644 index d634c7f1b2cd1c2632495d8e1f3b47b45867c353..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/normalize_kernels.cu +++ /dev/null @@ -1,2121 +0,0 @@ -#include "custom_cuda_layers.h" - -namespace cg = cooperative_groups; - -/* -Fused bias add, residual (elementwise) add, and normalization layer. - -For FP16, this kernel does not promote to FP32 in order to utilize the 2x throughput for -__half2 instructions, and avoid the conversion overhead (1/8 of __hal2 arithmetic). - -For specific launch constraints, see the launch functions. -*/ - -#define NORM_REG (MAX_REGISTERS / 4) - -__global__ void fused_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - bool preLayerNorm, - bool training, - float* vars, - float* means, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id / WARP_SIZE; - - float vals_arr[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - residual += (row * row_stride); - vals += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[i * iteration_stride + id]; - sum += vals_arr[i]; - } - if (high_index < row_stride) { - vals_arr[iterations] = residual[high_index]; - sum += vals_arr[iterations]; - iterations++; - } - - for (int 
i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700 - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - - sum = g.shfl(sum, 0); - float mean = sum / row_stride; - if (training) - if (threadIdx.x == 0) means[row] = mean; - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_arr[i] -= mean; - variance += vals_arr[i] * vals_arr[i]; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= row_stride; - variance += epsilon; - if (training) - if (threadIdx.x == 0) vars[row] = variance; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr[i] = vals_arr[i] * rsqrtf(variance); - vals_arr[i] = - vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[i * iteration_stride + id] = vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); - vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; - vals[high_index] = vals_arr[iterations]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - bool preLayerNorm, - bool training, - __half* vars, - __half* means, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE 
- int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> WARP_SIZE_BITS; - - float2 vals_f[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - const __half2* residual_cast = reinterpret_cast(residual); - - residual_cast += (row * row_stride); - vals_cast += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); - sum += vals_f[i].x; - sum += vals_f[i].y; - } - if ((high_index) < row_stride) { - vals_f[iterations] = __half22float2(residual_cast[high_index]); - sum += vals_f[iterations].x; - sum += vals_f[iterations].y; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - sum = g.shfl(sum, 0); - float mean = sum / (row_stride * 2); - - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_f[i].x -= mean; - vals_f[i].y -= mean; - variance += vals_f[i].x * vals_f[i].x; - variance += vals_f[i].y * vals_f[i].y; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += 
g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= (row_stride * 2); - variance += epsilon; - - __half2 variance_h = __float2half2_rn(variance); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - - if (training && threadIdx.x == 0) { - vars[row] = __float2half(variance); - means[row] = __float2half(mean); - } - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - __half2 vals_arr = __float22half2_rn(vals_f[i]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = - vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; - vals_cast[i * iteration_stride + id] = vals_arr; - } - if ((high_index) < row_stride) { - __half2 vals_arr = __float22half2_rn(vals_f[iterations]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index]; - vals_cast[high_index] = vals_arr; - } -#endif -} - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - T* vars, - T* means); - -template <> -void launch_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - float* vars, - float* means) -{ - int threads = THREADS; - - dim3 grid_dim(batch_size); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim); -} - -template <> -void 
launch_bias_residual_layer_norm<__half>(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - __half* vars, - __half* means) -{ - int threads = 128; - - dim3 grid_dim(batch_size); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim / 2); -} - -__global__ void fused_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - bool preLayerNorm, - bool training, - float* vars, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id / 32; - - float vals_arr[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - residual += (row * row_stride); - vals += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[i * iteration_stride + id]; - sum += vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = residual[high_index]; - sum += vals_arr[iterations]; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#if 
!defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700 - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - - sum = g.shfl(sum, 0); - float mean = sum / row_stride; - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_arr[i] -= mean; - variance += vals_arr[i] * vals_arr[i]; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= row_stride; - variance += epsilon; - if (training) - if (threadIdx.x == 0) vars[row] = variance; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr[i] = vals_arr[i] * rsqrtf(variance); - vals_arr[i] = - vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[i * iteration_stride + id] = vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); - vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; - vals[high_index] = vals_arr[iterations]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - bool preLayerNorm, - bool training, - __half* vars, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> WARP_SIZE_BITS; - - 
float2 vals_f[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - const __half2* residual_cast = reinterpret_cast(residual); - - residual_cast += (row * row_stride); - vals_cast += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); - sum += vals_f[i].x; - sum += vals_f[i].y; - } - if ((high_index) < row_stride) { - vals_f[iterations] = __half22float2(residual_cast[high_index]); - sum += vals_f[iterations].x; - sum += vals_f[iterations].y; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - sum = g.shfl(sum, 0); - float mean = sum / (row_stride * 2); - - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_f[i].x -= mean; - vals_f[i].y -= mean; - variance += vals_f[i].x * vals_f[i].x; - variance += vals_f[i].y * vals_f[i].y; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= (row_stride * 2); - variance += epsilon; - - __half2 variance_h = __float2half2_rn(variance); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - - if 
(training && threadIdx.x == 0) vars[row] = __float2half(variance); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - __half2 vals_arr = __float22half2_rn(vals_f[i]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = - vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; - vals_cast[i * iteration_stride + id] = vals_arr; - } - if ((high_index) < row_stride) { - __half2 vals_arr = __float22half2_rn(vals_f[iterations]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index]; - vals_cast[high_index] = vals_arr; - } -#endif -} - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - T* vars); - -/* -To tune this launch the following restrictions must be met: - -For float: -row_stride == hidden_size -threads * iterations == row_stride -threads is in [32, 64, 128, 256, 512, 1024] - -For half: -row_stride == hidden_size / 2 -threads * iterations == row_stride -threads is in [32, 64, 128, 256, 512, 1024] - -*/ - -template <> -void launch_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - float* vars) -{ - int threads = THREADS; - - dim3 grid_dim(batch_size); - - // There are some limitations to call below functions, now just enumerate the situations. 
- - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim); -} - -template <> -void launch_bias_residual_layer_norm<__half>(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - __half* vars) -{ - int threads = 128; - - dim3 grid_dim(batch_size); - - // There are some limitations to call below functions, now just enumerate the situations. - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - fused_bias_residual_layer_norm<<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim / 2); -} - -/* Normalize Gamma & Betta gradients - * Compute gradients using either X_hat or - * normalize input (invertible). - * Combine transpose with gradients computation. 
- */ - -template -__global__ void LayerNormBackward1(const T* __restrict__ out_grad, - const T* __restrict__ vals_hat, - const T* __restrict__ gamma, - const T* __restrict__ betta, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width, - bool invertible) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - float betta_reg = (invertible ? (float)betta[idx] : 0.0f); - float gamma_reg = (float)gamma[idx]; - - // Loop across matrix height - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad[offset]; - float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg - : (float)vals_hat[offset]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. - float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -/* Normalize Gamma & Betta gradients - * Compute gradients using the input to - * the normalize. - * Combine transpose with gradients computation. 
- */ - -template -__global__ void LayerNormBackward1(const T* __restrict__ out_grad, - const T* __restrict__ X_data, - const T* __restrict__ vars, - const T* __restrict__ means, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - // Loop across matrix height - - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad[offset]; - float val = (float)X_data[offset]; - val = (val - (float)means[r]) * rsqrtf((float)vars[r]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. - float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} -/* - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is invertible! - * We do the backward using the X_hat (X - u) / sqrt(variance) or the output of Normalization. 
- */ - -__global__ void LayerNormBackward2(const float* out_grad, - const float* vals_hat, - const float* gamma, - const float* betta, - const float* vars, - float* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad += (row * row_stride); - vals_hat += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat[high_index] - betta[high_index]) / gamma_reg - : vals_hat[high_index]); - iterations++; - } - - float var_reg = vars[row]; - - float sum = 0; - for (int i = 0; i < iterations; i++) { - sum += vals_hat_arr[i] * vals_arr[i] * - sqrtf(var_reg); // dval_hat = gamma * (x - u) * out_grad - vals_arr[i] *= rsqrtf(var_reg); // dvar_inv = gamma * out_grad / sqrt(var) - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); - if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); -} - -__global__ void LayerNormBackward2(const __half* out_grad, - const __half* vals_hat, - const __half* gamma, - const __half* betta, - const __half* vars, - __half* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = 
blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h = reinterpret_cast(out_grad); - const __half2* vals_hat_h = reinterpret_cast(vals_hat); - - inp_grad_h += (row * row_stride); - out_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible - ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg - : vals_hat_h[high_index]); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg)); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 temp_f = __half22float2(temp); - vals_arr_f[i].x += temp_f.x; - vals_arr_f[i].y += temp_f.y; - } - sum = 0.f; - - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - - inp_grad_h[i * iteration_stride + id] = temp; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - 
vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - - inp_grad_h[high_index] = temp; - } -} - -template <> -void launch_layerNorm_backward(const float* out_grad, - const float* vals_hat, - const float* vars, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2], - bool invertible, - const float* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<<>>( - out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - - LayerNormBackward2<<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); -} - -template <> -void launch_layerNorm_backward<__half>(const __half* out_grad, - const __half* vals_hat, - const __half* vars, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2], - bool invertible, - const __half* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - // LayerNormBackward1<__half><<>>( - // out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - - LayerNormBackward2<<>>( - out_grad, 
vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); -} - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is not invertible! - * We do the backward using the input (X) - */ - -__global__ void LayerNormBackward2(const float* out_grad, - const float* X_vals, - const float* gamma, - const float* vars, - const float* means, - float* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id >> WARP_SIZE_BITS; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad += (row * row_stride); - X_vals += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad[high_index]; - vals_arr[iterations] *= gamma_reg; - iterations++; - } - - float var_reg = vars[row]; - float mean_reg = means[row]; - - float sum = 0; - float xu[NORM_REG]; - for (int i = 0; i < iterations; i++) { - xu[i] = (X_vals[i * iteration_stride + id] - mean_reg); - sum += vals_arr[i] * xu[i]; - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - 
sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { - vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg)); - } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); - if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); -} - -__global__ void LayerNormBackward2(const __half* out_grad, - const __half* X_vals, - const __half* gamma, - const __half* vars, - const __half* means, - __half* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id >> WARP_SIZE_BITS; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 xu[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h = reinterpret_cast(out_grad); - const __half2* vals_hat_h = reinterpret_cast(X_vals); - - inp_grad_h += (row * row_stride); - out_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - int high_index = iterations * iteration_stride + id; - - __half mean_h = means[row]; - __half2 mean_reg = __halves2half2(mean_h, mean_h); 
-#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - xu[i] = (vals_hat_h[i * iteration_stride + id] - mean_reg); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - xu[iterations] = (vals_hat_h[high_index] - mean_reg); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (xu[i] * vals_arr[i]); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 xu_grad_f = __half22float2(xu_grad); - vals_arr_f[i].x += xu_grad_f.x; - vals_arr_f[i].y += xu_grad_f.y; - } - - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < 
warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[i * iteration_stride + id] = temp; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - inp_grad_h[high_index] = temp; - } -} - -template <> -void launch_layerNorm_backward(const float* out_grad, - const float* X_data, - const float* vars, - const float* means, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<<>>( - out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - LayerNormBackward2<<>>( - out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim); -} - -template <> -void launch_layerNorm_backward<__half>(const __half* out_grad, - const __half* X_data, - const __half* vars, - const __half* means, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<__half><<>>( - out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) 
- threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - LayerNormBackward2<<>>( - out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); -} - -template -__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1, - const T* __restrict__ out_grad2, - const T* __restrict__ vals_hat, - const T* __restrict__ gamma, - const T* __restrict__ betta, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width, - bool invertible) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - float betta_reg = (invertible ? (float)betta[idx] : 0.0f); - float gamma_reg = (float)gamma[idx]; - - // Loop across matrix height - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad1[offset] + (float)out_grad2[offset]; - float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg - : (float)vals_hat[offset]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -template -__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1, - const T* __restrict__ out_grad2, - const T* __restrict__ X_data, - const T* __restrict__ vars, - const T* __restrict__ means, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - // Loop across matrix height - - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad1[offset] + (float)out_grad2[offset]; - float val = (float)X_data[offset]; - val = (val - (float)means[r]) * rsqrtf((float)vars[r]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -__global__ void LayerNormBackward2_fused_add(const float* out_grad1, - const float* out_grad2, - const float* vals_hat, - const float* gamma, - const float* betta, - const float* vars, - float* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad1 += (row * row_stride); - out_grad2 += (row * row_stride); - vals_hat += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad1[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat[high_index] - betta[high_index]) / gamma_reg - : vals_hat[high_index]); - iterations++; - } - - float var_reg = vars[row]; - - float sum = 0; - for (int i = 0; i < iterations; i++) { - sum += vals_hat_arr[i] * vals_arr[i] * sqrtf(var_reg); - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) - inp_grad[i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; - if ((high_index) < row_stride) - inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; -} - -__global__ void LayerNormBackward2_fused_add(const __half* out_grad1, - const __half* out_grad2, - const __half* vals_hat, - const __half* gamma, - const __half* betta, - const __half* vars, - __half* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - 
- int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - // float2 result[iterations]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h1 = reinterpret_cast(out_grad1); - const __half2* out_grad_h2 = reinterpret_cast(out_grad2); - const __half2* vals_hat_h = reinterpret_cast(vals_hat); - - inp_grad_h += (row * row_stride); - out_grad_h1 += (row * row_stride); - out_grad_h2 += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = - (invertible - ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h1[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg - : vals_hat_h[high_index]); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg)); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 temp_f = __half22float2(temp); - vals_arr_f[i].x += temp_f.x; - vals_arr_f[i].y += temp_f.y; - } - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - - inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - 
vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - - inp_grad_h[high_index] = temp + out_grad_h2[high_index]; - } -} - -template <> -void launch_layerNorm_backward_fused_add(const float* out_grad1, - const float* out_grad2, - const float* vals_hat, - const float* vars, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2], - bool invertible, - const float* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - LayerNormBackward1<<>>( - out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - LayerNormBackward2_fused_add<<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); -} - -template <> -void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, - const __half* out_grad2, - const __half* vals_hat, - const __half* vars, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2], - bool invertible, - const __half* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<__half><<>>( - out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if 
(hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - LayerNormBackward2_fused_add<<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); -} - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is not invertible! - * We do the backward using the input (X) - */ - -__global__ void LayerNormBackward2_fused_add(const float* out_grad1, - const float* out_grad2, - const float* X_vals, - const float* gamma, - const float* vars, - const float* means, - float* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - - out_grad1 += (row * row_stride); - out_grad2 += (row * row_stride); - X_vals += (row * row_stride); - inp_grad += (row * row_stride); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = X_vals[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad1[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = X_vals[high_index]; - iterations++; - } - - float var_reg = vars[row]; - float mean_reg = means[row]; - - float sum = 0; - float xu[NORM_REG]; - for (int i = 0; i < iterations; i++) { - xu[i] = (vals_hat_arr[i] - mean_reg); - sum += vals_arr[i] * xu[i]; - vals_arr[i] *= rsqrtf(var_reg); - 
} - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { - vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg)); - } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) - inp_grad[i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; - if ((high_index) < row_stride) - inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; -} - -__global__ void LayerNormBackward2_fused_add(const __half* out_grad1, - const __half* out_grad2, - const __half* X_vals, - const __half* gamma, - const __half* vars, - const __half* means, - __half* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - __half2* 
inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h1 = reinterpret_cast(out_grad1); - const __half2* out_grad_h2 = reinterpret_cast(out_grad2); - const __half2* vals_hat_h = reinterpret_cast(X_vals); - - out_grad_h1 += (row * row_stride); - out_grad_h2 += (row * row_stride); - inp_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = vals_hat_h[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h1[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - vals_hat_arr[iterations] = vals_hat_h[high_index]; - iterations++; - } - - __half mean_h = means[row]; - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - __half2 mean_reg = __halves2half2(mean_h, mean_h); - __half2 xu[NORM_REG]; - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - xu[i] = (vals_hat_arr[i] - mean_reg); - __half2 result_h = (xu[i] * vals_arr[i]); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 xu_grad = ((-sum_h * 
xu[i] * h2rsqrt(var_reg)) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 xu_grad_f = __half22float2(xu_grad); - vals_arr_f[i].x += xu_grad_f.x; - vals_arr_f[i].y += xu_grad_f.y; - } - - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - inp_grad_h[high_index] = temp + out_grad_h2[high_index]; - } -} - -template <> -void launch_layerNorm_backward_fused_add(const float* out_grad1, - const float* out_grad2, - const float* X_data, - const float* vars, - const float* means, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<<>>( - out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport 
hidden_dim."); - - dim3 block_dim2(threads); - LayerNormBackward2_fused_add<<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim); -} - -template <> -void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, - const __half* out_grad2, - const __half* X_data, - const __half* vars, - const __half* means, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<__half><<>>( - out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - LayerNormBackward2_fused_add<<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); -} diff --git a/deepspeed/ops/csrc/transformer/normalize_kernels.hip b/deepspeed/ops/csrc/transformer/normalize_kernels.hip deleted file mode 100644 index 3d1b17c8f779f0940593a66fea8c07bba6c5534c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/normalize_kernels.hip +++ /dev/null @@ -1,2123 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -namespace cg = cooperative_groups; - -/* -Fused bias add, residual (elementwise) add, and normalization layer. - -For FP16, this kernel does not promote to FP32 in order to utilize the 2x throughput for -__half2 instructions, and avoid the conversion overhead (1/8 of __hal2 arithmetic). - -For specific launch constraints, see the launch functions. 
-*/ - -#define NORM_REG (MAX_REGISTERS / 4) - -__global__ void fused_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - bool preLayerNorm, - bool training, - float* vars, - float* means, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id / WARP_SIZE; - - float vals_arr[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - residual += (row * row_stride); - vals += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[i * iteration_stride + id]; - sum += vals_arr[i]; - } - if (high_index < row_stride) { - vals_arr[iterations] = residual[high_index]; - sum += vals_arr[iterations]; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700 - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - - sum = g.shfl(sum, 0); - float mean = sum / row_stride; - if (training) - if (threadIdx.x == 0) means[row] = mean; - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_arr[i] -= mean; - variance += vals_arr[i] * vals_arr[i]; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < 
(iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= row_stride; - variance += epsilon; - if (training) - if (threadIdx.x == 0) vars[row] = variance; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr[i] = vals_arr[i] * rsqrtf(variance); - vals_arr[i] = - vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[i * iteration_stride + id] = vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); - vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; - vals[high_index] = vals_arr[iterations]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - bool preLayerNorm, - bool training, - __half* vars, - __half* means, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> WARP_SIZE_BITS; - - float2 vals_f[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - const __half2* residual_cast = reinterpret_cast(residual); - - residual_cast += (row * row_stride); - vals_cast += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); - sum += vals_f[i].x; - sum += vals_f[i].y; - } - if ((high_index) < row_stride) { - vals_f[iterations] = __half22float2(residual_cast[high_index]); - sum += vals_f[iterations].x; - sum += 
vals_f[iterations].y; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - sum = g.shfl(sum, 0); - float mean = sum / (row_stride * 2); - - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_f[i].x -= mean; - vals_f[i].y -= mean; - variance += vals_f[i].x * vals_f[i].x; - variance += vals_f[i].y * vals_f[i].y; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= (row_stride * 2); - variance += epsilon; - - __half2 variance_h = __float2half2_rn(variance); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - - if (training && threadIdx.x == 0) { - vars[row] = __float2half(variance); - means[row] = __float2half(mean); - } - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - __half2 vals_arr = __float22half2_rn(vals_f[i]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = - vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; - vals_cast[i * iteration_stride + id] = vals_arr; - } - if ((high_index) < row_stride) { - __half2 vals_arr = __float22half2_rn(vals_f[iterations]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = vals_arr * gamma_cast[high_index] + 
beta_cast[high_index]; - vals_cast[high_index] = vals_arr; - } -#endif -} - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - T* vars, - T* means); - -template <> -void launch_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - float* vars, - float* means) -{ - int threads = THREADS; - - dim3 grid_dim(batch_size); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim); -} - -template <> -void launch_bias_residual_layer_norm<__half>(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - __half* vars, - __half* means) -{ - int threads = 128; - - dim3 grid_dim(batch_size); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim / 2); -} - -__global__ void 
fused_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - bool preLayerNorm, - bool training, - float* vars, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id / 32; - - float vals_arr[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - residual += (row * row_stride); - vals += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[i * iteration_stride + id]; - sum += vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = residual[high_index]; - sum += vals_arr[iterations]; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700 - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - - sum = g.shfl(sum, 0); - float mean = sum / row_stride; - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_arr[i] -= mean; - variance += vals_arr[i] * vals_arr[i]; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance 
/= row_stride; - variance += epsilon; - if (training) - if (threadIdx.x == 0) vars[row] = variance; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr[i] = vals_arr[i] * rsqrtf(variance); - vals_arr[i] = - vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[i * iteration_stride + id] = vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); - vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; - vals[high_index] = vals_arr[iterations]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - bool preLayerNorm, - bool training, - __half* vars, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> WARP_SIZE_BITS; - - float2 vals_f[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - const __half2* residual_cast = reinterpret_cast(residual); - - residual_cast += (row * row_stride); - vals_cast += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); - sum += vals_f[i].x; - sum += vals_f[i].y; - } - if ((high_index) < row_stride) { - vals_f[iterations] = __half22float2(residual_cast[high_index]); - sum += vals_f[iterations].x; - sum += vals_f[iterations].y; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - 
- if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - sum = g.shfl(sum, 0); - float mean = sum / (row_stride * 2); - - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_f[i].x -= mean; - vals_f[i].y -= mean; - variance += vals_f[i].x * vals_f[i].x; - variance += vals_f[i].y * vals_f[i].y; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= (row_stride * 2); - variance += epsilon; - - __half2 variance_h = __float2half2_rn(variance); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - - if (training && threadIdx.x == 0) vars[row] = __float2half(variance); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - __half2 vals_arr = __float22half2_rn(vals_f[i]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = - vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; - vals_cast[i * iteration_stride + id] = vals_arr; - } - if ((high_index) < row_stride) { - __half2 vals_arr = __float22half2_rn(vals_f[iterations]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index]; - vals_cast[high_index] = vals_arr; - } -#endif -} - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - 
int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - T* vars); - -/* -To tune this launch the following restrictions must be met: - -For float: -row_stride == hidden_size -threads * iterations == row_stride -threads is in [32, 64, 128, 256, 512, 1024] - -For half: -row_stride == hidden_size / 2 -threads * iterations == row_stride -threads is in [32, 64, 128, 256, 512, 1024] - -*/ - -template <> -void launch_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - float* vars) -{ - int threads = THREADS; - - dim3 grid_dim(batch_size); - - // There are some limitations to call below functions, now just enumerate the situations. - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim); -} - -template <> -void launch_bias_residual_layer_norm<__half>(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - __half* vars) -{ - int threads = 128; - - dim3 grid_dim(batch_size); - - // There are some limitations to call below functions, now just enumerate the situations. 
- - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim / 2); -} - -/* Normalize Gamma & Betta gradients - * Compute gradients using either X_hat or - * normalize input (invertible). - * Combine transpose with gradients computation. - */ - -template -__global__ void LayerNormBackward1(const T* __restrict__ out_grad, - const T* __restrict__ vals_hat, - const T* __restrict__ gamma, - const T* __restrict__ betta, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width, - bool invertible) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - float betta_reg = (invertible ? (float)betta[idx] : 0.0f); - float gamma_reg = (float)gamma[idx]; - - // Loop across matrix height - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad[offset]; - float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg - : (float)vals_hat[offset]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -/* Normalize Gamma & Betta gradients - * Compute gradients using the input to - * the normalize. - * Combine transpose with gradients computation. - */ - -template -__global__ void LayerNormBackward1(const T* __restrict__ out_grad, - const T* __restrict__ X_data, - const T* __restrict__ vars, - const T* __restrict__ means, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - // Loop across matrix height - - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad[offset]; - float val = (float)X_data[offset]; - val = (val - (float)means[r]) * rsqrtf((float)vars[r]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} -/* - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is invertible! - * We do the backward using the X_hat (X - u) / sqrt(variance) or the output of Normalization. - */ - -__global__ void LayerNormBackward2(const float* out_grad, - const float* vals_hat, - const float* gamma, - const float* betta, - const float* vars, - float* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad += (row * row_stride); - vals_hat += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat[high_index] - betta[high_index]) / gamma_reg - : vals_hat[high_index]); - iterations++; - } - - float var_reg = vars[row]; - - float sum = 0; - for (int i = 0; i < iterations; i++) { - sum += vals_hat_arr[i] * vals_arr[i] * - sqrtf(var_reg); // dval_hat = gamma * (x - u) * out_grad - vals_arr[i] *= rsqrtf(var_reg); // dvar_inv = gamma * out_grad / sqrt(var) - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); - if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); -} - -__global__ void LayerNormBackward2(const __half* out_grad, - const __half* vals_hat, - const __half* gamma, - const __half* betta, - const __half* vars, - __half* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = 
blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h = reinterpret_cast(out_grad); - const __half2* vals_hat_h = reinterpret_cast(vals_hat); - - inp_grad_h += (row * row_stride); - out_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible - ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg - : vals_hat_h[high_index]); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg)); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 temp_f = __half22float2(temp); - vals_arr_f[i].x += temp_f.x; - vals_arr_f[i].y += temp_f.y; - } - sum = 0.f; - - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - - inp_grad_h[i * iteration_stride + id] = temp; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - 
vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - - inp_grad_h[high_index] = temp; - } -} - -template <> -void launch_layerNorm_backward(const float* out_grad, - const float* vals_hat, - const float* vars, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2], - bool invertible, - const float* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - - hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); -} - -template <> -void launch_layerNorm_backward<__half>(const __half* out_grad, - const __half* vals_hat, - const __half* vars, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2], - bool invertible, - const __half* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - //hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0], - // out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && 
hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - - hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); -} - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is not invertible! - * We do the backward using the input (X) - */ - -__global__ void LayerNormBackward2(const float* out_grad, - const float* X_vals, - const float* gamma, - const float* vars, - const float* means, - float* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id >> WARP_SIZE_BITS; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad += (row * row_stride); - X_vals += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad[high_index]; - vals_arr[iterations] *= gamma_reg; - iterations++; - } - - float var_reg = vars[row]; - float mean_reg = means[row]; - - float sum = 0; - float xu[NORM_REG]; - for (int i = 0; i < iterations; i++) { - xu[i] = (X_vals[i * iteration_stride + id] - mean_reg); - sum += vals_arr[i] * xu[i]; - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if 
(g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { - vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg)); - } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); - if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); -} - -__global__ void LayerNormBackward2(const __half* out_grad, - const __half* X_vals, - const __half* gamma, - const __half* vars, - const __half* means, - __half* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id >> WARP_SIZE_BITS; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 xu[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h = reinterpret_cast(out_grad); - const __half2* vals_hat_h = reinterpret_cast(X_vals); - - inp_grad_h += (row * row_stride); - 
out_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - int high_index = iterations * iteration_stride + id; - - __half mean_h = means[row]; - __half2 mean_reg = __halves2half2(mean_h, mean_h); -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - xu[i] = (vals_hat_h[i * iteration_stride + id] - mean_reg); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - xu[iterations] = (vals_hat_h[high_index] - mean_reg); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (xu[i] * vals_arr[i]); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 xu_grad_f = __half22float2(xu_grad); - vals_arr_f[i].x += xu_grad_f.x; - vals_arr_f[i].y += xu_grad_f.y; - } - - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) 
{ sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[i * iteration_stride + id] = temp; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - inp_grad_h[high_index] = temp; - } -} - -template <> -void launch_layerNorm_backward(const float* out_grad, - const float* X_data, - const float* vars, - const float* means, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim); -} - -template <> -void launch_layerNorm_backward<__half>(const __half* out_grad, - const __half* X_data, - const __half* vars, - const __half* means, - const __half* gamma, - __half* gamma_grad, - 
__half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); -} - -template -__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1, - const T* __restrict__ out_grad2, - const T* __restrict__ vals_hat, - const T* __restrict__ gamma, - const T* __restrict__ betta, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width, - bool invertible) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - float betta_reg = (invertible ? (float)betta[idx] : 0.0f); - float gamma_reg = (float)gamma[idx]; - - // Loop across matrix height - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad1[offset] + (float)out_grad2[offset]; - float val = (invertible ? 
((float)vals_hat[offset] - betta_reg) / gamma_reg - : (float)vals_hat[offset]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. - float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -template -__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1, - const T* __restrict__ out_grad2, - const T* __restrict__ X_data, - const T* __restrict__ vars, - const T* __restrict__ means, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - // Loop across matrix height - - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad1[offset] + (float)out_grad2[offset]; - float val = (float)X_data[offset]; - val = (val - (float)means[r]) * rsqrtf((float)vars[r]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -__global__ void LayerNormBackward2_fused_add(const float* out_grad1, - const float* out_grad2, - const float* vals_hat, - const float* gamma, - const float* betta, - const float* vars, - float* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad1 += (row * row_stride); - out_grad2 += (row * row_stride); - vals_hat += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad1[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat[high_index] - betta[high_index]) / gamma_reg - : vals_hat[high_index]); - iterations++; - } - - float var_reg = vars[row]; - - float sum = 0; - for (int i = 0; i < iterations; i++) { - sum += vals_hat_arr[i] * vals_arr[i] * sqrtf(var_reg); - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) - inp_grad[i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; - if ((high_index) < row_stride) - inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; -} - -__global__ void LayerNormBackward2_fused_add(const __half* out_grad1, - const __half* out_grad2, - const __half* vals_hat, - const __half* gamma, - const __half* betta, - const __half* vars, - __half* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - 
- int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - // float2 result[iterations]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h1 = reinterpret_cast(out_grad1); - const __half2* out_grad_h2 = reinterpret_cast(out_grad2); - const __half2* vals_hat_h = reinterpret_cast(vals_hat); - - inp_grad_h += (row * row_stride); - out_grad_h1 += (row * row_stride); - out_grad_h2 += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = - (invertible - ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h1[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg - : vals_hat_h[high_index]); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg)); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 temp_f = __half22float2(temp); - vals_arr_f[i].x += temp_f.x; - vals_arr_f[i].y += temp_f.y; - } - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - - inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - 
vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - - inp_grad_h[high_index] = temp + out_grad_h2[high_index]; - } -} - -template <> -void launch_layerNorm_backward_fused_add(const float* out_grad1, - const float* out_grad2, - const float* vals_hat, - const float* vars, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2], - bool invertible, - const float* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - hipLaunchKernelGGL(( LayerNormBackward1), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); -} - -template <> -void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, - const __half* out_grad2, - const __half* vals_hat, - const __half* vars, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2], - bool invertible, - const __half* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 
8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); -} - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is not invertible! - * We do the backward using the input (X) - */ - -__global__ void LayerNormBackward2_fused_add(const float* out_grad1, - const float* out_grad2, - const float* X_vals, - const float* gamma, - const float* vars, - const float* means, - float* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - - out_grad1 += (row * row_stride); - out_grad2 += (row * row_stride); - X_vals += (row * row_stride); - inp_grad += (row * row_stride); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = X_vals[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad1[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = X_vals[high_index]; - 
iterations++; - } - - float var_reg = vars[row]; - float mean_reg = means[row]; - - float sum = 0; - float xu[NORM_REG]; - for (int i = 0; i < iterations; i++) { - xu[i] = (vals_hat_arr[i] - mean_reg); - sum += vals_arr[i] * xu[i]; - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { - vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg)); - } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) - inp_grad[i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; - if ((high_index) < row_stride) - inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; -} - -__global__ void LayerNormBackward2_fused_add(const __half* out_grad1, - const __half* out_grad2, - const __half* X_vals, - const __half* gamma, - const __half* vars, - const __half* means, - __half* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - 
int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h1 = reinterpret_cast(out_grad1); - const __half2* out_grad_h2 = reinterpret_cast(out_grad2); - const __half2* vals_hat_h = reinterpret_cast(X_vals); - - out_grad_h1 += (row * row_stride); - out_grad_h2 += (row * row_stride); - inp_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = vals_hat_h[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h1[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - vals_hat_arr[iterations] = vals_hat_h[high_index]; - iterations++; - } - - __half mean_h = means[row]; - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - __half2 mean_reg = __halves2half2(mean_h, mean_h); - __half2 xu[NORM_REG]; - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - xu[i] = (vals_hat_arr[i] - mean_reg); - __half2 result_h = (xu[i] * vals_arr[i]); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - 
__syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 xu_grad_f = __half22float2(xu_grad); - vals_arr_f[i].x += xu_grad_f.x; - vals_arr_f[i].y += xu_grad_f.y; - } - - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - inp_grad_h[high_index] = temp + out_grad_h2[high_index]; - } -} - -template <> -void launch_layerNorm_backward_fused_add(const float* out_grad1, - const float* out_grad2, - const float* X_data, - const float* vars, - const float* means, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1), dim3(grid_dim), dim3(block_dim), 
0, stream[0], - out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim); -} - -template <> -void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, - const __half* out_grad2, - const __half* X_data, - const __half* vars, - const __half* means, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); -} diff --git a/deepspeed/ops/csrc/transformer/softmax_kernels.cu b/deepspeed/ops/csrc/transformer/softmax_kernels.cu deleted file mode 100644 index 34487cba9bbec060baa0fb79922e691076296476..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/softmax_kernels.cu +++ 
/dev/null @@ -1,595 +0,0 @@ -#include -#include "custom_cuda_layers.h" -#include "general_kernels.h" - -namespace cg = cooperative_groups; - -dim3 get_attn_softmax_grid(int batch_size, int heads, int sequence_length, int threads) -{ - int seq_length4 = sequence_length / 4; - int block_compute_size = - (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1); - // Note that the Y and Z dimensions are limited to 65535, while X is basically unlimited: - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications - // The batch size is typically relatively small, while the sequence length could potentially be - // arbitrarily large. We therefore place the batch size second to avoid hitting the Y limit. - unsigned x = heads * sequence_length / block_compute_size; - unsigned y = batch_size; - return {x, y}; -} - -// Fused attention + softmax -template -__global__ void attn_softmax(float* vals, - const float* attn_mask, - int heads, - int seq_length, - int iterations) -{ - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int batch = blockIdx.y; - int row = blockIdx.x; - int max_threads_in_sequence = std::max(seq_length, tbSeq); - int seq_lane = threadIdx.x % max_threads_in_sequence; - - int data_offset = batch * (gridDim.x * block_width) + row * block_width + - (threadIdx.x / max_threads_in_sequence) * seq_length; - int mask_offset = batch * seq_length; - - int wid = threadIdx.x >> WARP_SIZE_BITS; - int lane = threadIdx.x & 0x1f; - - float4* val_cast = reinterpret_cast(vals); - const float4* attn_mask_cast = reinterpret_cast(attn_mask); - - float4 data[MAX_THREAD_ITERATIONS]; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int 
data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float4 mask = attn_mask_cast[mask_offset + data_id]; - data[i] = val_cast[data_offset + data_id]; - - data[i].x += mask.x; - data[i].y += mask.y; - data[i].z += mask.z; - data[i].w += mask.w; - - max_val = (data[i].x > max_val ? data[i].x : max_val); - max_val = (data[i].y > max_val ? data[i].y : max_val); - max_val = (data[i].z > max_val ? data[i].z : max_val); - max_val = (data[i].w > max_val ? data[i].w : max_val); - } else { - data[i].x = minus_infinity; - data[i].y = minus_infinity; - data[i].z = minus_infinity; - data[i].w = minus_infinity; - } - } - - for (int i = 1; i < tbSize; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / tbSize); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - data[i].x = __expf(data[i].x - max_val); - data[i].y = __expf(data[i].y - max_val); - data[i].z = __expf(data[i].z - max_val); - data[i].w = __expf(data[i].w - max_val); - - sum += (data[i].x + data[i].y + data[i].z + data[i].w); - } - - for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / tbSize); - } - - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - data[i].x /= sum; - data[i].y /= sum; - data[i].z /= sum; - data[i].w /= sum; - - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) val_cast[data_offset + data_id] = data[i]; - } -} - -template -__global__ void attn_softmax(__half* vals, - const __half* attn_mask, - int heads, - int seq_length, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int batch = blockIdx.y; - int row = blockIdx.x; - int max_threads_in_sequence = std::max(seq_length, tbSeq); - int seq_lane = threadIdx.x % max_threads_in_sequence; - - int data_offset = batch * (gridDim.x * block_width) + row * block_width + - (threadIdx.x / max_threads_in_sequence) * seq_length; - int mask_offset = batch * seq_length; - - int wid = threadIdx.x >> WARP_SIZE_BITS; - int lane = 
threadIdx.x & 0x1f; - - float2* val_cast = reinterpret_cast(vals); - const float2* attn_mask_cast = reinterpret_cast(attn_mask); - - val_cast += data_offset; - attn_mask_cast += mask_offset; - - float2 low_data[MAX_THREAD_ITERATIONS]; - float2 high_data[MAX_THREAD_ITERATIONS]; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float2 data = val_cast[data_id]; - float2 mask = attn_mask_cast[data_id]; - - __half2* data_arr = reinterpret_cast<__half2*>(&data); - __half2* mask_arr = reinterpret_cast<__half2*>(&mask); - - low_data[i] = __half22float2(data_arr[0]); - high_data[i] = __half22float2(data_arr[1]); - float2 low_mask = __half22float2(mask_arr[0]); - float2 high_mask = __half22float2(mask_arr[1]); - - low_data[i].x += low_mask.x; - low_data[i].y += low_mask.y; - high_data[i].x += high_mask.x; - high_data[i].y += high_mask.y; - - max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); - max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); - max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); - max_val = (high_data[i].y > max_val ? high_data[i].y : max_val); - } - } - - for (int i = 1; i < tbSize; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / tbSize); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - low_data[i].x = __expf(low_data[i].x - max_val); - low_data[i].y = __expf(low_data[i].y - max_val); - high_data[i].x = __expf(high_data[i].x - max_val); - high_data[i].y = __expf(high_data[i].y - max_val); - - sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y); - } - } - - for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / tbSize); - } - - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - low_data[i].x /= sum; - low_data[i].y /= sum; - high_data[i].x /= sum; - high_data[i].y /= sum; - - result_h[0] = __float22half2_rn(low_data[i]); - result_h[1] = __float22half2_rn(high_data[i]); - - val_cast[data_id] = result_f; - } - } - -#endif -} - -template -void launch_attn_softmax(T*, const T*, int, int, int, cudaStream_t); - -template <> -void launch_attn_softmax(float* vals, - const float* attn_mask, - int batch_size, - int heads, - int sequence_length, - cudaStream_t stream) -{ - const int threads = 128; - int seq_length4 = sequence_length / 4; - - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? 
((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - int iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - - if (sequence_length <= 8) - attn_softmax<2, (threads / 2), 2> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 16) - attn_softmax<4, (threads / 4), 4> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 32) - attn_softmax<8, (threads / 8), 8> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 64) - attn_softmax<16, (threads / 16), 16> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 128) - attn_softmax<32, (threads / 32), 32> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 256) - attn_softmax<32, (threads / 64), 64> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else { - const int threads = 256; - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - if (sequence_length <= 512) - attn_softmax<32, (threads / 128), 128><<>>( - vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4)) - attn_softmax<32, 1, 128><<>>( - vals, attn_mask, heads, seq_length4, iterations); - else - throw std::runtime_error( - "Unsupport Seq_Length! 
Check the restriction of the max_threads and " - "max_thread_iterations!"); - } -} - -template <> -void launch_attn_softmax<__half>(__half* vals, - const __half* attn_mask, - int batch_size, - int heads, - int sequence_length, - cudaStream_t stream) -{ - const int threads = 128; - int seq_length4 = sequence_length / 4; - - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - - int iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - - if (sequence_length <= 8) - attn_softmax<2, (threads / 2), 2> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 16) - attn_softmax<4, (threads / 4), 4> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 32) - attn_softmax<8, (threads / 8), 8> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 64) - attn_softmax<16, (threads / 16), 16> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 128) - attn_softmax<32, (threads / 32), 32> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 256) - attn_softmax<32, (threads / 64), 64> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else { - const int threads = 256; - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - iterations = - (sequence_length < subblock_max_workload ? 
(seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - if (sequence_length <= 512) - attn_softmax<32, (threads / 128), 128><<>>( - vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4)) - attn_softmax<32, 1, 128><<>>( - vals, attn_mask, heads, seq_length4, iterations); - else - throw std::runtime_error( - "Unsupport Seq_Length! Check the restriction of the max_threads and " - "max_thread_iterations!"); - } -} - -template -__global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_length) -{ - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; // warp-count = num_threads / WARP_SIZE (32) - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - int iterations = (seq_length < (MAX_THREAD_ITERATIONS * iteration_stride) - ? (seq_length + iteration_stride - 1) / iteration_stride - : MAX_THREAD_ITERATIONS); - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - - int wid = id >> WARP_SIZE_BITS; - int lane = id & 0x1f; - - T val_reg[MAX_THREAD_ITERATIONS]; - T soft_reg[MAX_THREAD_ITERATIONS]; - float grad_reg = 0.0f; - -#pragma unroll - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + id; - if (data_id < block_width) { - val_reg[i] = out_grad[row * block_width + data_id]; - soft_reg[i] = soft_inp[row * block_width + data_id]; - - grad_reg += ((float)val_reg[i] * - (float)soft_reg[i]); // if done in half, the multiplication, we may lose - // 2% of accuracy in computation!! 
- } - } - for (int i = 1; i < tbSize; i *= 2) grad_reg += g.shfl_xor(grad_reg, i); - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = grad_reg; - b.sync(); - - if (lane < warp_num) grad_reg = partialSum[lane]; - - int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); - - for (int i = 1; i < iters; i *= 2) grad_reg += g.shfl_xor(grad_reg, i); - - grad_reg = g.shfl(grad_reg, id / tbSize); - } - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + id; - if (data_id < block_width) { - float temp = (float)soft_reg[i] * ((float)val_reg[i] - grad_reg); - out_grad[row * block_width + data_id] = (T)temp; - } - } -} - -template -__global__ void softmax_backward_kernel_v2(T* grad /* input & output*/, - const T* output, - int softmax_length) -{ - int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; - int offset = batch_idx * softmax_length + threadIdx.x; - - grad += offset; - output += offset; - - T grad_reg[ITERATIONS]; - T output_reg[ITERATIONS]; - float sum = 0.0; - -#pragma unroll - for (int i = 0; i < ITERATIONS; ++i) { - int curr_idx = threadIdx.x + i * WARP_SIZE; - if (curr_idx < softmax_length) { - grad_reg[i] = grad[i * WARP_SIZE]; - output_reg[i] = output[i * WARP_SIZE]; - sum += (float)grad_reg[i] * (float)output_reg[i]; - } - } - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i); - -#pragma unroll - for (int i = 0; i < ITERATIONS; ++i) { - int curr_idx = threadIdx.x + i * WARP_SIZE; - if (curr_idx < softmax_length) - grad[i * WARP_SIZE] = (float)output_reg[i] * ((float)grad_reg[i] - sum); - } -} - -template -void launch_attn_softmax_backward_v2(T* out_grad, - const T* soft_inp, - int batch_size, - int heads, - int seq_length, - cudaStream_t stream) -{ - const int warps_per_block = 4; - dim3 grid_dim(batch_size * heads * seq_length / 
warps_per_block); - dim3 block_dim(WARP_SIZE, warps_per_block); - - if (seq_length <= 32) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 64) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 128) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 256) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 384) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 512) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 768) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 1024) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 2048) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else - throw std::runtime_error( - std::string("Special sequence length found in softmax backward, seq_length: ") + - std::to_string(seq_length)); -} - -template void launch_attn_softmax_backward_v2<__half>(__half* out_grad, - const __half* soft_inp, - int batch_size, - int heads, - int seq_length, - cudaStream_t stream); -template void launch_attn_softmax_backward_v2(float* out_grad, - const float* soft_inp, - int batch_size, - int heads, - int seq_length, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer/softmax_kernels.hip b/deepspeed/ops/csrc/transformer/softmax_kernels.hip deleted file mode 100644 index afe65b0c9cbdc6b10027db2ddd5c7e8f447e0c24..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/softmax_kernels.hip +++ /dev/null @@ -1,597 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include -#include "custom_hip_layers.h" -#include "general_kernels_hip.h" - -namespace cg = cooperative_groups; - -dim3 get_attn_softmax_grid(int batch_size, int heads, int sequence_length, int threads) -{ - int seq_length4 = sequence_length / 4; - int block_compute_size = - (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1); - // Note that the Y and Z dimensions are limited to 65535, while X is basically unlimited: - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications - // The batch size is typically relatively small, while the sequence length could potentially be - // arbitrarily large. We therefore place the batch size second to avoid hitting the Y limit. - unsigned x = heads * sequence_length / block_compute_size; - unsigned y = batch_size; - return {x, y}; -} - -// Fused attention + softmax -template -__global__ void attn_softmax(float* vals, - const float* attn_mask, - int heads, - int seq_length, - int iterations) -{ - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int batch = blockIdx.y; - int row = blockIdx.x; - int max_threads_in_sequence = ::max(seq_length, tbSeq); - int seq_lane = threadIdx.x % max_threads_in_sequence; - - int data_offset = batch * (gridDim.x * block_width) + row * block_width + - (threadIdx.x / max_threads_in_sequence) * seq_length; - int mask_offset = batch * seq_length; - - int wid = threadIdx.x >> WARP_SIZE_BITS; - int lane = threadIdx.x & 0x1f; - - float4* val_cast = reinterpret_cast(vals); - const float4* attn_mask_cast = reinterpret_cast(attn_mask); - - float4 data[MAX_THREAD_ITERATIONS]; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int 
data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float4 mask = attn_mask_cast[mask_offset + data_id]; - data[i] = val_cast[data_offset + data_id]; - - data[i].x += mask.x; - data[i].y += mask.y; - data[i].z += mask.z; - data[i].w += mask.w; - - max_val = (data[i].x > max_val ? data[i].x : max_val); - max_val = (data[i].y > max_val ? data[i].y : max_val); - max_val = (data[i].z > max_val ? data[i].z : max_val); - max_val = (data[i].w > max_val ? data[i].w : max_val); - } else { - data[i].x = minus_infinity; - data[i].y = minus_infinity; - data[i].z = minus_infinity; - data[i].w = minus_infinity; - } - } - - for (int i = 1; i < tbSize; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / tbSize); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - data[i].x = __expf(data[i].x - max_val); - data[i].y = __expf(data[i].y - max_val); - data[i].z = __expf(data[i].z - max_val); - data[i].w = __expf(data[i].w - max_val); - - sum += (data[i].x + data[i].y + data[i].z + data[i].w); - } - - for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / tbSize); - } - - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - data[i].x /= sum; - data[i].y /= sum; - data[i].z /= sum; - data[i].w /= sum; - - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) val_cast[data_offset + data_id] = data[i]; - } -} - -template -__global__ void attn_softmax(__half* vals, - const __half* attn_mask, - int heads, - int seq_length, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int batch = blockIdx.y; - int row = blockIdx.x; - int max_threads_in_sequence = ::max(seq_length, tbSeq); - int seq_lane = threadIdx.x % max_threads_in_sequence; - - int data_offset = batch * (gridDim.x * block_width) + row * block_width + - (threadIdx.x / max_threads_in_sequence) * seq_length; - int mask_offset = batch * seq_length; - - int wid = threadIdx.x >> WARP_SIZE_BITS; - int lane = 
threadIdx.x & 0x1f; - - float2* val_cast = reinterpret_cast(vals); - const float2* attn_mask_cast = reinterpret_cast(attn_mask); - - val_cast += data_offset; - attn_mask_cast += mask_offset; - - float2 low_data[MAX_THREAD_ITERATIONS]; - float2 high_data[MAX_THREAD_ITERATIONS]; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float2 data = val_cast[data_id]; - float2 mask = attn_mask_cast[data_id]; - - __half2* data_arr = reinterpret_cast<__half2*>(&data); - __half2* mask_arr = reinterpret_cast<__half2*>(&mask); - - low_data[i] = __half22float2(data_arr[0]); - high_data[i] = __half22float2(data_arr[1]); - float2 low_mask = __half22float2(mask_arr[0]); - float2 high_mask = __half22float2(mask_arr[1]); - - low_data[i].x += low_mask.x; - low_data[i].y += low_mask.y; - high_data[i].x += high_mask.x; - high_data[i].y += high_mask.y; - - max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); - max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); - max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); - max_val = (high_data[i].y > max_val ? high_data[i].y : max_val); - } - } - - for (int i = 1; i < tbSize; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / tbSize); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - low_data[i].x = __expf(low_data[i].x - max_val); - low_data[i].y = __expf(low_data[i].y - max_val); - high_data[i].x = __expf(high_data[i].x - max_val); - high_data[i].y = __expf(high_data[i].y - max_val); - - sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y); - } - } - - for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / tbSize); - } - - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - low_data[i].x /= sum; - low_data[i].y /= sum; - high_data[i].x /= sum; - high_data[i].y /= sum; - - result_h[0] = __float22half2_rn(low_data[i]); - result_h[1] = __float22half2_rn(high_data[i]); - - val_cast[data_id] = result_f; - } - } - -#endif -} - -template -void launch_attn_softmax(T*, const T*, int, int, int, hipStream_t); - -template <> -void launch_attn_softmax(float* vals, - const float* attn_mask, - int batch_size, - int heads, - int sequence_length, - hipStream_t stream) -{ - const int threads = 128; - int seq_length4 = sequence_length / 4; - - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? 
((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - int iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - - if (sequence_length <= 8) - hipLaunchKernelGGL(( attn_softmax<2, (threads / 2), 2>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 16) - hipLaunchKernelGGL(( attn_softmax<4, (threads / 4), 4>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 32) - hipLaunchKernelGGL(( attn_softmax<8, (threads / 8), 8>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 64) - hipLaunchKernelGGL(( attn_softmax<16, (threads / 16), 16>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 128) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 32), 32>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 256) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 64), 64>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else { - const int threads = 256; - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - iterations = - (sequence_length < subblock_max_workload ? 
(seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - if (sequence_length <= 512) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 128), 128>), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4)) - hipLaunchKernelGGL(( attn_softmax<32, 1, 128>), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, attn_mask, heads, seq_length4, iterations); - else - throw std::runtime_error( - "Unsupport Seq_Length! Check the restriction of the max_threads and " - "max_thread_iterations!"); - } -} - -template <> -void launch_attn_softmax<__half>(__half* vals, - const __half* attn_mask, - int batch_size, - int heads, - int sequence_length, - hipStream_t stream) -{ - const int threads = 128; - int seq_length4 = sequence_length / 4; - - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - - int iterations = - (sequence_length < subblock_max_workload ? 
(seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - - if (sequence_length <= 8) - hipLaunchKernelGGL(( attn_softmax<2, (threads / 2), 2>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 16) - hipLaunchKernelGGL(( attn_softmax<4, (threads / 4), 4>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 32) - hipLaunchKernelGGL(( attn_softmax<8, (threads / 8), 8>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 64) - hipLaunchKernelGGL(( attn_softmax<16, (threads / 16), 16>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 128) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 32), 32>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 256) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 64), 64>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else { - const int threads = 256; - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - iterations = - (sequence_length < subblock_max_workload ? 
(seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - if (sequence_length <= 512) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 128), 128>), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4)) - hipLaunchKernelGGL(( attn_softmax<32, 1, 128>), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, attn_mask, heads, seq_length4, iterations); - else - throw std::runtime_error( - "Unsupport Seq_Length! Check the restriction of the max_threads and " - "max_thread_iterations!"); - } -} - -template -__global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_length) -{ - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; // warp-count = num_threads / WARP_SIZE (32) - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - int iterations = (seq_length < (MAX_THREAD_ITERATIONS * iteration_stride) - ? (seq_length + iteration_stride - 1) / iteration_stride - : MAX_THREAD_ITERATIONS); - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - - int wid = id >> WARP_SIZE_BITS; - int lane = id & 0x1f; - - T val_reg[MAX_THREAD_ITERATIONS]; - T soft_reg[MAX_THREAD_ITERATIONS]; - float grad_reg = 0.0f; - -#pragma unroll - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + id; - if (data_id < block_width) { - val_reg[i] = out_grad[row * block_width + data_id]; - soft_reg[i] = soft_inp[row * block_width + data_id]; - - grad_reg += ((float)val_reg[i] * - (float)soft_reg[i]); // if done in half, the multiplication, we may lose - // 2% of accuracy in computation!! 
- } - } - for (int i = 1; i < tbSize; i *= 2) grad_reg += g.shfl_xor(grad_reg, i); - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = grad_reg; - b.sync(); - - if (lane < warp_num) grad_reg = partialSum[lane]; - - int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); - - for (int i = 1; i < iters; i *= 2) grad_reg += g.shfl_xor(grad_reg, i); - - grad_reg = g.shfl(grad_reg, id / tbSize); - } - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + id; - if (data_id < block_width) { - float temp = (float)soft_reg[i] * ((float)val_reg[i] - grad_reg); - out_grad[row * block_width + data_id] = (T)temp; - } - } -} - -template -__global__ void softmax_backward_kernel_v2(T* grad /* input & output*/, - const T* output, - int softmax_length) -{ - int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; - int offset = batch_idx * softmax_length + threadIdx.x; - - grad += offset; - output += offset; - - T grad_reg[ITERATIONS]; - T output_reg[ITERATIONS]; - float sum = 0.0; - -#pragma unroll - for (int i = 0; i < ITERATIONS; ++i) { - int curr_idx = threadIdx.x + i * WARP_SIZE; - if (curr_idx < softmax_length) { - grad_reg[i] = grad[i * WARP_SIZE]; - output_reg[i] = output[i * WARP_SIZE]; - sum += (float)grad_reg[i] * (float)output_reg[i]; - } - } - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i); - -#pragma unroll - for (int i = 0; i < ITERATIONS; ++i) { - int curr_idx = threadIdx.x + i * WARP_SIZE; - if (curr_idx < softmax_length) - grad[i * WARP_SIZE] = (float)output_reg[i] * ((float)grad_reg[i] - sum); - } -} - -template -void launch_attn_softmax_backward_v2(T* out_grad, - const T* soft_inp, - int batch_size, - int heads, - int seq_length, - hipStream_t stream) -{ - const int warps_per_block = 4; - dim3 grid_dim(batch_size * heads * seq_length / 
warps_per_block); - dim3 block_dim(WARP_SIZE, warps_per_block); - - if (seq_length <= 32) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 64) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 128) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 256) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 384) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 512) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 768) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 1024) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 2048) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else - throw std::runtime_error( - std::string("Special sequence length found in softmax backward, seq_length: ") + - std::to_string(seq_length)); -} - -template void launch_attn_softmax_backward_v2<__half>(__half* out_grad, - const __half* soft_inp, - int batch_size, - int heads, - int seq_length, - hipStream_t stream); -template void launch_attn_softmax_backward_v2(float* out_grad, - const float* soft_inp, - int batch_size, - int heads, - int seq_length, - hipStream_t 
stream); diff --git a/deepspeed/ops/csrc/transformer/transform_kernels.cu b/deepspeed/ops/csrc/transformer/transform_kernels.cu deleted file mode 100644 index 15a2219333e43a6da1b93038a406b35d302bb9d9..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/transform_kernels.cu +++ /dev/null @@ -1,575 +0,0 @@ -#include "custom_cuda_layers.h" - -#define rows_trans 16 -#define cols_trans 16 - -template -__global__ void Transpose_Kernel(const T* inp, T* out, int row_width, int col_width) -{ - __shared__ T data_block[rows_trans * (cols_trans + 1)]; - - int r = threadIdx.x / cols_trans; - int c = threadIdx.x % cols_trans; - - int m = row_width / cols_trans; - - int i = blockIdx.x / m * rows_trans + r; - int j = blockIdx.x % m * cols_trans + c; - - int row_stride = rows_trans / ((rows_trans * cols_trans + THREADS - 1) / THREADS); - - for (int k = 0; k < rows_trans; k += row_stride) - data_block[(k + r) * cols_trans + c] = inp[(i + k) * row_width + j]; - - __syncthreads(); - - i = blockIdx.x % m * rows_trans + r; - j = blockIdx.x / m * cols_trans + c; - - for (int k = 0; k < rows_trans; k += row_stride) - out[(i + k) * col_width + j] = data_block[c * cols_trans + r + k]; -} - -template <> -void Transpose<__half>(const __half* inp_mat, - __half* out_mat, - int rows, - int cols, - cudaStream_t stream) -{ - int threads = THREADS; - - Transpose_Kernel<__half><<<(rows * cols + threads - 1) / threads, threads, 0, stream>>>( - inp_mat, out_mat, cols, rows); -} - -template <> -void Transpose(const float* inp_mat, float* out_mat, int rows, int cols, cudaStream_t stream) -{ - int threads = THREADS; - - Transpose_Kernel<<<(rows * cols + threads - 1) / threads, threads, 0, stream>>>( - inp_mat, out_mat, cols, rows); -} - -template -__global__ void transform_0213(T* output, - const T* vals, - int hidden_dim, - int seq_length, - int heads, - int head_ext); - -template <> -__global__ void transform_0213(float* output, - const float* vals, - int hidden_dim, - int 
seq_length, - int heads, - int head_ext) -{ - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) - int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - const float4* vals_vec = reinterpret_cast(vals); - float4* output_vec = reinterpret_cast(output); - - float4 inputs = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3]; - output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = inputs; -} - -template <> -__global__ void transform_0213<__half>(__half* output, - const __half* vals, - int hidden_dim, - int seq_length, - int heads, - int head_ext) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) - int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - float4 vals_arr[1]; - - const float4* vals_vec = reinterpret_cast(vals); - float4* output_vec = reinterpret_cast(output); - - vals_arr[0] = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3]; - output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = vals_arr[0]; -#endif -} - -template <> -void launch_transform_0213(float* output, - const float* vals, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - cudaStream_t stream) -{ - hidden_dim >>= 2; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 
block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, (seq_length * head_ext)); - - transform_0213 - <<>>(output, vals, hidden_dim, seq_length, heads, head_ext); -} - -template <> -void launch_transform_0213<__half>(__half* output, - const __half* vals, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - cudaStream_t stream) -{ - hidden_dim >>= 3; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, (seq_length * head_ext)); - transform_0213<__half> - <<>>(output, vals, hidden_dim, seq_length, heads, head_ext); -} - -// Bias add -template -__global__ void bias_add_transform_0213(T* output, - const T* vals, - const T* bias, - int hidden_dim, - int seq_length, - int heads, - int head_ext); - -template <> -__global__ void bias_add_transform_0213(float* output, - const float* vals, - const float* bias, - int hidden_dim, - int seq_length, - int heads, - int head_ext) -{ - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y; // Sequence ID (0-127) - int cnt = blockIdx.z / head_ext; // Hidden count - int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - const float4* vals_vec = reinterpret_cast(vals); - const float4* bias_vec = reinterpret_cast(bias); - float4* output_vec = reinterpret_cast(output); - - float4 inputs = vals_vec[d0 * d0_stride * (gridDim.z / head_ext) + cnt * d1_stride + - d1 * d1_stride * (gridDim.z / head_ext) + d2 * d2_stride + d3]; - float4 biases = bias_vec[cnt * d1_stride + d2 * d2_stride + d3]; - - float4 outputs; - outputs.x = inputs.x + biases.x; - outputs.y = inputs.y + biases.y; - outputs.z = inputs.z + biases.z; - 
outputs.w = inputs.w + biases.w; - - output_vec[cnt * d0_out_stride * gridDim.x + d0 * d0_out_stride + d1 * d1_out_stride + - d2 * d2_out_stride + d3] = outputs; -} - -#define ATTN_H 3 -#define MAX_SEQ_LINE 10 - -template <> -__global__ void bias_add_transform_0213<__half>(__half* output, - const __half* vals, - const __half* bias, - int hidden_dim, - int seq_length, - int heads, - int head_ext) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y; // Sequence ID (0-127) - int cnt = blockIdx.z / head_ext; // Hidden count - int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - float4 vals_arr; - float4 bias_arr; - float4 output_arr; - __half2* vals_half = reinterpret_cast<__half2*>(&vals_arr); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_arr); - __half2* output_half = reinterpret_cast<__half2*>(&output_arr); - - const float4* vals_vec = reinterpret_cast(vals); - const float4* bias_vec = reinterpret_cast(bias); - float4* output_vec = reinterpret_cast(output); - - vals_vec += (d0 * d0_stride * (gridDim.z / head_ext)); - vals_vec += (d1 * d1_stride * (gridDim.z / head_ext)); - vals_vec += (cnt * d1_stride); - vals_vec += (d2 * d2_stride); - - bias_vec += (cnt * d1_stride); - bias_vec += (d2 * d2_stride); - - output_vec += (cnt * d0_stride * gridDim.x); - output_vec += (d1 * d2_stride); - output_vec += (d0 * d0_stride); - output_vec += (d2 * d2_out_stride); - - bias_arr = bias_vec[d3]; - vals_arr = vals_vec[d3]; - -#if defined(__ACC_HALF__) - output_half[0] = vals_half[0] + bias_half[0]; - output_half[1] = vals_half[1] + bias_half[1]; - output_half[2] = vals_half[2] + bias_half[2]; - output_half[3] = vals_half[3] + bias_half[3]; -#else - float2 bias_arr_f[4]; - float2 vals_arr_f[4]; 
-#pragma unroll - for (int l = 0; l < 4; l++) { - bias_arr_f[l] = __half22float2(bias_half[l]); - vals_arr_f[l] = __half22float2(vals_half[l]); - vals_arr_f[l].x += bias_arr_f[l].x; - vals_arr_f[l].y += bias_arr_f[l].y; - output_half[l] = __float22half2_rn(vals_arr_f[l]); - } -#endif - output_vec[d3] = output_arr; - -#endif -} - -__global__ void bias_add_transform_0213_v2(__half* output, - const __half* vals, - const __half* bias, - int hidden_dim, - int seq_length, - int heads) -{ -#ifdef HALF_PRECISION_AVAILABLE - __shared__ float4 in_data[3072]; - - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - int iteration_stride = d1_stride * blockDim.z; // Hidden * 3 / 8 - int batch_stride = d0_stride * blockDim.z; // Hidden * S * 3 / 8 - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y; // Sequence ID (0-127) - int cnt = threadIdx.z; // blockIdx.z; // Hidden count - int d2 = threadIdx.y; // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - float4 vals_arr[1]; - float4 bias_arr[1]; - float4 output_arr[1]; - __half2* vals_half = reinterpret_cast<__half2*>(vals_arr); - __half2* bias_half = reinterpret_cast<__half2*>(bias_arr); - __half2* output_half = reinterpret_cast<__half2*>(output_arr); - - const float4* vals_vec = reinterpret_cast(vals); - const float4* bias_vec = reinterpret_cast(bias); - float4* output_vec = reinterpret_cast(output); - - int iter_index = cnt * d1_stride + d2 * d2_stride + d3; - int input_offset = d0 * batch_stride + d1 * (iteration_stride << 1); - bias_arr[0] = bias_vec[iter_index]; - -#pragma unroll - for (int iter = 0; iter < 2; iter++) { - int iter_id = iter * iteration_stride + iter_index; - vals_arr[0] = vals_vec[input_offset + iter_id]; - - output_half[0] = vals_half[0] + bias_half[0]; - output_half[1] = vals_half[1] + bias_half[1]; - output_half[2] = 
vals_half[2] + bias_half[2]; - output_half[3] = vals_half[3] + bias_half[3]; - - in_data[iter_id] = output_arr[0]; - } - __syncthreads(); - - iteration_stride = blockDim.z * (blockDim.y >> 1); - int matrix_stride = (d0_out_stride * gridDim.x); - int head_count = (d2 >> 1) + cnt * (blockDim.y >> 1); - - int out_index = d0 * d0_out_stride + d1 * (d1_out_stride << 1) + d3 + (d2 % 2) * d2_stride; - -#pragma unroll - for (int iter = 0; iter < 2; iter++) { - int iter_row = (iter * iteration_stride) + head_count; - int iter_offset = - (iter_row % blockDim.y) * d2_out_stride + (iter_row / blockDim.y) * matrix_stride; - output_vec[out_index + iter_offset] = - in_data[iter_row * d2_stride + d3 + (d2 % 2) * (d1_stride * blockDim.z)]; - } -#endif -} - -// [B S C*H] - > C * [B A S N] -template <> -void launch_bias_add_transform_0213(float* output, - const float* vals, - const float* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - cudaStream_t stream, - int trans_count) -{ - hidden_dim >>= 2; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); - - bias_add_transform_0213<<>>( - output, vals, bias, hidden_dim, seq_length, heads, head_ext); -} - -template <> -void launch_bias_add_transform_0213<__half>(__half* output, - const __half* vals, - const __half* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - cudaStream_t stream, - int trans_count) -{ - hidden_dim >>= 3; - if (hidden_dim > 128 || hidden_dim < 16) { - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); - bias_add_transform_0213<__half><<>>( - output, vals, bias, hidden_dim, seq_length, heads, head_ext); - } else { - dim3 block_dim(hidden_dim / heads, heads, trans_count); - dim3 grid_dim(batch_size, seq_length / 2); - 
bias_add_transform_0213_v2<<>>( - output, vals, bias, hidden_dim, seq_length, heads); - } -} - -template -__global__ void transform4d_0213(T* out, - const T* in, - int heads, - int seq_length, - int hidden_dim, - int head_ext); - -template <> -__global__ void transform4d_0213(float* out, - const float* in, - int heads, - int seq_length, - int hidden_dim, - int head_ext) -{ - int d0_stride = hidden_dim * seq_length; - int d1_stride = d0_stride / heads; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = hidden_dim; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / ((seq_length - 1) / blockDim.y + 1); // Head - int d2 = (threadIdx.y + blockDim.y * blockIdx.y) % seq_length; - int cnt = blockIdx.z; - int d3 = threadIdx.x; // Values (groups of 8) - - if (d2 < seq_length) { - const float4* in_vec = reinterpret_cast(in); - float4* out_vec = reinterpret_cast(out); - - float4 vals_vec = in_vec[cnt * d0_stride * gridDim.x + d0 * d0_stride + d1 * d1_stride + - d2 * d2_stride + d3]; - out_vec[d0 * d0_out_stride * gridDim.z + cnt * d2_out_stride + d1 * d1_out_stride + - d2 * d2_out_stride * gridDim.z + d3] = vals_vec; - } -} - -template <> -__global__ void transform4d_0213<__half>(__half* out, - const __half* in, - int heads, - int seq_length, - int hidden_dim, - int head_ext) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int d0_stride = hidden_dim * (seq_length / head_ext); - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0 = blockIdx.x; // Batch - int d1 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head - int d2 = blockIdx.z / head_ext; // Sequence - int cnt = blockIdx.y; // Hidden count - int d3 = threadIdx.x; // Values (groups of 8) - - const float4* in_vec = reinterpret_cast(in); - float4* out_vec = reinterpret_cast(out); - - in_vec += (cnt * d0_stride * gridDim.x); - in_vec += (d0 * d0_stride); - in_vec += (d2 * d2_stride); - in_vec += (d1 * 
d2_stride * seq_length); - - out_vec += (cnt * d1_stride); - out_vec += (d1 * d2_stride); - out_vec += (d0 * d0_stride * gridDim.y); - out_vec += (d2 * d1_stride * gridDim.y); - - out_vec[d3] = in_vec[d3]; - -#endif -} - -__global__ void transform4d_0213_v2(__half* out, - const __half* in, - int heads, - int seq_length, - int hidden_dim) -{ -#ifdef HALF_PRECISION_AVAILABLE - __shared__ float4 in_data[3072]; - - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0 = blockIdx.x; // Batch - int d1 = threadIdx.y; // Head - int d2 = blockIdx.y; // Sequence - int cnt = threadIdx.z; // Hidden count - int d3 = threadIdx.x; // Values (groups of 8) - - const float4* in_vec = reinterpret_cast(in); - float4* out_vec = reinterpret_cast(out); - - int input_offset = d0 * d0_stride + d2 * (d2_stride << 1) + d3 + (d1 % 2) * d2_stride; - int head_count = (d1 >> 1) + cnt * (blockDim.y >> 1); - int iteration_stride = blockDim.z * (blockDim.y >> 1); - int matrix_stride = (d0_stride * gridDim.x); - -#pragma unroll - for (int iter = 0; iter < 2; iter++) { - int iter_row = iter * iteration_stride + head_count; - int iter_offset = (iter_row % blockDim.y) * d2_stride; - - in_data[d3 + iter_offset + (iter_row / blockDim.y + (d1 % 2) * blockDim.z) * d1_stride] = - in_vec[input_offset + iter_offset * seq_length + - (iter_row / blockDim.y) * matrix_stride]; - } - __syncthreads(); - - iteration_stride = d1_stride * blockDim.z; - int iter_index = cnt * d1_stride + d1 * d2_stride + d3; - int output_offset = d0 * d0_stride * blockDim.z + d2 * (iteration_stride << 1); - -#pragma unroll - for (int iter = 0; iter < 2; iter++) { - int iter_id = iter * iteration_stride + iter_index; - out_vec[output_offset + iter_id] = in_data[iter_id]; - } -#endif -} - -// 3 * [B A S N] - > [B S C*H] -template <> -void launch_transform4d_0213(float* out, - const float* in, - int batch_size, - int heads, - int seq_length, - int hidden_dim, - cudaStream_t 
stream, - int trans_count) -{ - hidden_dim >>= 2; - dim3 grid_dims(batch_size, heads * ((seq_length - 1) / 8 + 1), trans_count); - dim3 block_dims(hidden_dim / heads, 8); - transform4d_0213 - <<>>(out, in, heads, seq_length, hidden_dim, 1); -} - -template <> -void launch_transform4d_0213<__half>(__half* out, - const __half* in, - int batch_size, - int heads, - int seq_length, - int hidden_dim, - cudaStream_t stream, - int trans_count) -{ - hidden_dim >>= 3; - if (hidden_dim > 128 || hidden_dim < 16) { - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext)); - dim3 block_dims(hidden_dim / heads, (heads / head_ext)); - transform4d_0213<__half><<>>( - out, in, heads, seq_length, hidden_dim, head_ext); - } else { - dim3 grid_dims(batch_size, seq_length / 2); - dim3 block_dims(hidden_dim / heads, heads, trans_count); - transform4d_0213_v2<<>>( - out, in, heads, seq_length, hidden_dim); - } -} diff --git a/deepspeed/ops/csrc/transformer/transform_kernels.hip b/deepspeed/ops/csrc/transformer/transform_kernels.hip deleted file mode 100644 index 0aaa4cca150e18ed63c701e66ce4eaf6313e30ab..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer/transform_kernels.hip +++ /dev/null @@ -1,577 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -#define rows_trans 16 -#define cols_trans 16 - -template -__global__ void Transpose_Kernel(const T* inp, T* out, int row_width, int col_width) -{ - __shared__ T data_block[rows_trans * (cols_trans + 1)]; - - int r = threadIdx.x / cols_trans; - int c = threadIdx.x % cols_trans; - - int m = row_width / cols_trans; - - int i = blockIdx.x / m * rows_trans + r; - int j = blockIdx.x % m * cols_trans + c; - - int row_stride = rows_trans / ((rows_trans * cols_trans + THREADS - 1) / THREADS); - - for (int k = 0; k < rows_trans; k += row_stride) - data_block[(k + r) * cols_trans + c] = inp[(i + k) * row_width + j]; - - __syncthreads(); - - i = blockIdx.x % m * rows_trans + r; - j = blockIdx.x / m * cols_trans + c; - - for (int k = 0; k < rows_trans; k += row_stride) - out[(i + k) * col_width + j] = data_block[c * cols_trans + r + k]; -} - -template <> -void Transpose<__half>(const __half* inp_mat, - __half* out_mat, - int rows, - int cols, - hipStream_t stream) -{ - int threads = THREADS; - - hipLaunchKernelGGL(( Transpose_Kernel<__half>), dim3((rows * cols + threads - 1) / threads), dim3(threads), 0, stream, - inp_mat, out_mat, cols, rows); -} - -template <> -void Transpose(const float* inp_mat, float* out_mat, int rows, int cols, hipStream_t stream) -{ - int threads = THREADS; - - hipLaunchKernelGGL(( Transpose_Kernel), dim3((rows * cols + threads - 1) / threads), dim3(threads), 0, stream, - inp_mat, out_mat, cols, rows); -} - -template -__global__ void transform_0213(T* output, - const T* vals, - int hidden_dim, - int seq_length, - int heads, - int head_ext); - -template <> -__global__ void transform_0213(float* output, - const float* vals, - int hidden_dim, - int seq_length, - int heads, - int head_ext) -{ - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = 
d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) - int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - const float4* vals_vec = reinterpret_cast(vals); - float4* output_vec = reinterpret_cast(output); - - float4 inputs = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3]; - output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = inputs; -} - -template <> -__global__ void transform_0213<__half>(__half* output, - const __half* vals, - int hidden_dim, - int seq_length, - int heads, - int head_ext) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) - int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - float4 vals_arr[1]; - - const float4* vals_vec = reinterpret_cast(vals); - float4* output_vec = reinterpret_cast(output); - - vals_arr[0] = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3]; - output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = vals_arr[0]; -#endif -} - -template <> -void launch_transform_0213(float* output, - const float* vals, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - hipStream_t stream) -{ - hidden_dim >>= 2; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, (seq_length * head_ext)); - - hipLaunchKernelGGL(( transform_0213) - , dim3(grid_dim), dim3(block_dim), 0, stream, output, vals, hidden_dim, seq_length, heads, head_ext); -} - 
-template <> -void launch_transform_0213<__half>(__half* output, - const __half* vals, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - hipStream_t stream) -{ - hidden_dim >>= 3; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, (seq_length * head_ext)); - hipLaunchKernelGGL(( transform_0213<__half>) - , dim3(grid_dim), dim3(block_dim), 0, stream, output, vals, hidden_dim, seq_length, heads, head_ext); -} - -// Bias add -template -__global__ void bias_add_transform_0213(T* output, - const T* vals, - const T* bias, - int hidden_dim, - int seq_length, - int heads, - int head_ext); - -template <> -__global__ void bias_add_transform_0213(float* output, - const float* vals, - const float* bias, - int hidden_dim, - int seq_length, - int heads, - int head_ext) -{ - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y; // Sequence ID (0-127) - int cnt = blockIdx.z / head_ext; // Hidden count - int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - const float4* vals_vec = reinterpret_cast(vals); - const float4* bias_vec = reinterpret_cast(bias); - float4* output_vec = reinterpret_cast(output); - - float4 inputs = vals_vec[d0 * d0_stride * (gridDim.z / head_ext) + cnt * d1_stride + - d1 * d1_stride * (gridDim.z / head_ext) + d2 * d2_stride + d3]; - float4 biases = bias_vec[cnt * d1_stride + d2 * d2_stride + d3]; - - float4 outputs; - outputs.x = inputs.x + biases.x; - outputs.y = inputs.y + biases.y; - outputs.z = inputs.z + biases.z; - outputs.w = inputs.w + biases.w; - - output_vec[cnt * d0_out_stride * gridDim.x + d0 * d0_out_stride + d1 * d1_out_stride + - d2 * 
d2_out_stride + d3] = outputs; -} - -#define ATTN_H 3 -#define MAX_SEQ_LINE 10 - -template <> -__global__ void bias_add_transform_0213<__half>(__half* output, - const __half* vals, - const __half* bias, - int hidden_dim, - int seq_length, - int heads, - int head_ext) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y; // Sequence ID (0-127) - int cnt = blockIdx.z / head_ext; // Hidden count - int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - float4 vals_arr; - float4 bias_arr; - float4 output_arr; - __half2* vals_half = reinterpret_cast<__half2*>(&vals_arr); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_arr); - __half2* output_half = reinterpret_cast<__half2*>(&output_arr); - - const float4* vals_vec = reinterpret_cast(vals); - const float4* bias_vec = reinterpret_cast(bias); - float4* output_vec = reinterpret_cast(output); - - vals_vec += (d0 * d0_stride * (gridDim.z / head_ext)); - vals_vec += (d1 * d1_stride * (gridDim.z / head_ext)); - vals_vec += (cnt * d1_stride); - vals_vec += (d2 * d2_stride); - - bias_vec += (cnt * d1_stride); - bias_vec += (d2 * d2_stride); - - output_vec += (cnt * d0_stride * gridDim.x); - output_vec += (d1 * d2_stride); - output_vec += (d0 * d0_stride); - output_vec += (d2 * d2_out_stride); - - bias_arr = bias_vec[d3]; - vals_arr = vals_vec[d3]; - -#if defined(__ACC_HALF__) - output_half[0] = vals_half[0] + bias_half[0]; - output_half[1] = vals_half[1] + bias_half[1]; - output_half[2] = vals_half[2] + bias_half[2]; - output_half[3] = vals_half[3] + bias_half[3]; -#else - float2 bias_arr_f[4]; - float2 vals_arr_f[4]; -#pragma unroll - for (int l = 0; l < 4; l++) { - bias_arr_f[l] = __half22float2(bias_half[l]); - vals_arr_f[l] = 
__half22float2(vals_half[l]); - vals_arr_f[l].x += bias_arr_f[l].x; - vals_arr_f[l].y += bias_arr_f[l].y; - output_half[l] = __float22half2_rn(vals_arr_f[l]); - } -#endif - output_vec[d3] = output_arr; - -#endif -} - -__global__ void bias_add_transform_0213_v2(__half* output, - const __half* vals, - const __half* bias, - int hidden_dim, - int seq_length, - int heads) -{ -#ifdef HALF_PRECISION_AVAILABLE - __shared__ float4 in_data[3072]; - - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - int iteration_stride = d1_stride * blockDim.z; // Hidden * 3 / 8 - int batch_stride = d0_stride * blockDim.z; // Hidden * S * 3 / 8 - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = d2_stride * seq_length; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y; // Sequence ID (0-127) - int cnt = threadIdx.z; // blockIdx.z; // Hidden count - int d2 = threadIdx.y; // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) - - float4 vals_arr[1]; - float4 bias_arr[1]; - float4 output_arr[1]; - __half2* vals_half = reinterpret_cast<__half2*>(vals_arr); - __half2* bias_half = reinterpret_cast<__half2*>(bias_arr); - __half2* output_half = reinterpret_cast<__half2*>(output_arr); - - const float4* vals_vec = reinterpret_cast(vals); - const float4* bias_vec = reinterpret_cast(bias); - float4* output_vec = reinterpret_cast(output); - - int iter_index = cnt * d1_stride + d2 * d2_stride + d3; - int input_offset = d0 * batch_stride + d1 * (iteration_stride << 1); - bias_arr[0] = bias_vec[iter_index]; - -#pragma unroll - for (int iter = 0; iter < 2; iter++) { - int iter_id = iter * iteration_stride + iter_index; - vals_arr[0] = vals_vec[input_offset + iter_id]; - - output_half[0] = vals_half[0] + bias_half[0]; - output_half[1] = vals_half[1] + bias_half[1]; - output_half[2] = vals_half[2] + bias_half[2]; - output_half[3] = vals_half[3] + bias_half[3]; - - in_data[iter_id] = output_arr[0]; 
- } - __syncthreads(); - - iteration_stride = blockDim.z * (blockDim.y >> 1); - int matrix_stride = (d0_out_stride * gridDim.x); - int head_count = (d2 >> 1) + cnt * (blockDim.y >> 1); - - int out_index = d0 * d0_out_stride + d1 * (d1_out_stride << 1) + d3 + (d2 % 2) * d2_stride; - -#pragma unroll - for (int iter = 0; iter < 2; iter++) { - int iter_row = (iter * iteration_stride) + head_count; - int iter_offset = - (iter_row % blockDim.y) * d2_out_stride + (iter_row / blockDim.y) * matrix_stride; - output_vec[out_index + iter_offset] = - in_data[iter_row * d2_stride + d3 + (d2 % 2) * (d1_stride * blockDim.z)]; - } -#endif -} - -// [B S C*H] - > C * [B A S N] -template <> -void launch_bias_add_transform_0213(float* output, - const float* vals, - const float* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - hipStream_t stream, - int trans_count) -{ - hidden_dim >>= 2; - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); - - hipLaunchKernelGGL(( bias_add_transform_0213), dim3(grid_dim), dim3(block_dim), 0, stream, - output, vals, bias, hidden_dim, seq_length, heads, head_ext); -} - -template <> -void launch_bias_add_transform_0213<__half>(__half* output, - const __half* vals, - const __half* bias, - int batch_size, - int seq_length, - int hidden_dim, - int heads, - hipStream_t stream, - int trans_count) -{ - hidden_dim >>= 3; - if (hidden_dim > 128 || hidden_dim < 16) { - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 block_dim(hidden_dim / heads, (heads / head_ext)); - dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); - hipLaunchKernelGGL(( bias_add_transform_0213<__half>), dim3(grid_dim), dim3(block_dim), 0, stream, - output, vals, bias, hidden_dim, seq_length, heads, head_ext); - } else { - dim3 block_dim(hidden_dim / heads, heads, trans_count); - dim3 grid_dim(batch_size, seq_length / 2); 
- hipLaunchKernelGGL(( bias_add_transform_0213_v2), dim3(grid_dim), dim3(block_dim), 0, stream, - output, vals, bias, hidden_dim, seq_length, heads); - } -} - -template -__global__ void transform4d_0213(T* out, - const T* in, - int heads, - int seq_length, - int hidden_dim, - int head_ext); - -template <> -__global__ void transform4d_0213(float* out, - const float* in, - int heads, - int seq_length, - int hidden_dim, - int head_ext) -{ - int d0_stride = hidden_dim * seq_length; - int d1_stride = d0_stride / heads; - int d2_stride = hidden_dim / heads; - - int d0_out_stride = d0_stride; - int d1_out_stride = d2_stride; - int d2_out_stride = hidden_dim; - - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / ((seq_length - 1) / blockDim.y + 1); // Head - int d2 = (threadIdx.y + blockDim.y * blockIdx.y) % seq_length; - int cnt = blockIdx.z; - int d3 = threadIdx.x; // Values (groups of 8) - - if (d2 < seq_length) { - const float4* in_vec = reinterpret_cast(in); - float4* out_vec = reinterpret_cast(out); - - float4 vals_vec = in_vec[cnt * d0_stride * gridDim.x + d0 * d0_stride + d1 * d1_stride + - d2 * d2_stride + d3]; - out_vec[d0 * d0_out_stride * gridDim.z + cnt * d2_out_stride + d1 * d1_out_stride + - d2 * d2_out_stride * gridDim.z + d3] = vals_vec; - } -} - -template <> -__global__ void transform4d_0213<__half>(__half* out, - const __half* in, - int heads, - int seq_length, - int hidden_dim, - int head_ext) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int d0_stride = hidden_dim * (seq_length / head_ext); - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0 = blockIdx.x; // Batch - int d1 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head - int d2 = blockIdx.z / head_ext; // Sequence - int cnt = blockIdx.y; // Hidden count - int d3 = threadIdx.x; // Values (groups of 8) - - const float4* in_vec = reinterpret_cast(in); - float4* out_vec = reinterpret_cast(out); - - in_vec += (cnt * d0_stride * gridDim.x); - in_vec += (d0 * 
d0_stride); - in_vec += (d2 * d2_stride); - in_vec += (d1 * d2_stride * seq_length); - - out_vec += (cnt * d1_stride); - out_vec += (d1 * d2_stride); - out_vec += (d0 * d0_stride * gridDim.y); - out_vec += (d2 * d1_stride * gridDim.y); - - out_vec[d3] = in_vec[d3]; - -#endif -} - -__global__ void transform4d_0213_v2(__half* out, - const __half* in, - int heads, - int seq_length, - int hidden_dim) -{ -#ifdef HALF_PRECISION_AVAILABLE - __shared__ float4 in_data[3072]; - - int d0_stride = hidden_dim * seq_length; - int d1_stride = hidden_dim; - int d2_stride = hidden_dim / heads; - - int d0 = blockIdx.x; // Batch - int d1 = threadIdx.y; // Head - int d2 = blockIdx.y; // Sequence - int cnt = threadIdx.z; // Hidden count - int d3 = threadIdx.x; // Values (groups of 8) - - const float4* in_vec = reinterpret_cast(in); - float4* out_vec = reinterpret_cast(out); - - int input_offset = d0 * d0_stride + d2 * (d2_stride << 1) + d3 + (d1 % 2) * d2_stride; - int head_count = (d1 >> 1) + cnt * (blockDim.y >> 1); - int iteration_stride = blockDim.z * (blockDim.y >> 1); - int matrix_stride = (d0_stride * gridDim.x); - -#pragma unroll - for (int iter = 0; iter < 2; iter++) { - int iter_row = iter * iteration_stride + head_count; - int iter_offset = (iter_row % blockDim.y) * d2_stride; - - in_data[d3 + iter_offset + (iter_row / blockDim.y + (d1 % 2) * blockDim.z) * d1_stride] = - in_vec[input_offset + iter_offset * seq_length + - (iter_row / blockDim.y) * matrix_stride]; - } - __syncthreads(); - - iteration_stride = d1_stride * blockDim.z; - int iter_index = cnt * d1_stride + d1 * d2_stride + d3; - int output_offset = d0 * d0_stride * blockDim.z + d2 * (iteration_stride << 1); - -#pragma unroll - for (int iter = 0; iter < 2; iter++) { - int iter_id = iter * iteration_stride + iter_index; - out_vec[output_offset + iter_id] = in_data[iter_id]; - } -#endif -} - -// 3 * [B A S N] - > [B S C*H] -template <> -void launch_transform4d_0213(float* out, - const float* in, - int batch_size, - 
int heads, - int seq_length, - int hidden_dim, - hipStream_t stream, - int trans_count) -{ - hidden_dim >>= 2; - dim3 grid_dims(batch_size, heads * ((seq_length - 1) / 8 + 1), trans_count); - dim3 block_dims(hidden_dim / heads, 8); - hipLaunchKernelGGL(( transform4d_0213) - , dim3(grid_dims), dim3(block_dims), 0, stream, out, in, heads, seq_length, hidden_dim, 1); -} - -template <> -void launch_transform4d_0213<__half>(__half* out, - const __half* in, - int batch_size, - int heads, - int seq_length, - int hidden_dim, - hipStream_t stream, - int trans_count) -{ - hidden_dim >>= 3; - if (hidden_dim > 128 || hidden_dim < 16) { - int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; - dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext)); - dim3 block_dims(hidden_dim / heads, (heads / head_ext)); - hipLaunchKernelGGL(( transform4d_0213<__half>), dim3(grid_dims), dim3(block_dims), 0, stream, - out, in, heads, seq_length, hidden_dim, head_ext); - } else { - dim3 grid_dims(batch_size, seq_length / 2); - dim3 block_dims(hidden_dim / heads, heads, trans_count); - hipLaunchKernelGGL(( transform4d_0213_v2), dim3(grid_dims), dim3(block_dims), 0, stream, - out, in, heads, seq_length, hidden_dim); - } -} diff --git a/deepspeed/ops/csrc/transformer_bak/cublas_wrappers.cu b/deepspeed/ops/csrc/transformer_bak/cublas_wrappers.cu deleted file mode 100644 index 75ecd3fb4ef9d5d63d9c7681bdce0cf949641b5d..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/cublas_wrappers.cu +++ /dev/null @@ -1,403 +0,0 @@ -#include "cublas_wrappers.h" - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* 
alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f32_r, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - rocblas_datatype_f32_r, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - C, - rocblas_datatype_f32_r, - m, - C, - rocblas_datatype_f32_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - CUDA_R_32F, - (transa == CUBLAS_OP_N) ? m : k, - (const void*)B, - CUDA_R_32F, - (transb == CUBLAS_OP_N) ? k : n, - (const void*)beta, - C, - CUDA_R_32F, - m, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f16_r, - (transa == rocblas_operation_none) ? 
m : k, - (const void*)B, - rocblas_datatype_f16_r, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - (void*)C, - rocblas_datatype_f16_r, - m, - (void*)C, - rocblas_datatype_f16_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - CUDA_R_16F, - (transa == CUBLAS_OP_N) ? m : k, - (const void*)B, - CUDA_R_16F, - (transb == CUBLAS_OP_N) ? k : n, - (const void*)beta, - (void*)C, - CUDA_R_16F, - m, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f32_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f32_r, - (op_B == rocblas_operation_none) ? 
k : n, - stride_B, - beta, - C, - rocblas_datatype_f32_r, - m, - stride_C, - C, - rocblas_datatype_f32_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - CUDA_R_32F, - (op_A == CUBLAS_OP_N) ? m : k, - stride_A, - B, - CUDA_R_32F, - (op_B == CUBLAS_OP_N) ? k : n, - stride_B, - beta, - C, - CUDA_R_32F, - m, - stride_C, - batch, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n", - batch, - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f16_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f16_r, - (op_B == rocblas_operation_none) ? 
k : n, - stride_B, - beta, - C, - rocblas_datatype_f16_r, - m, - stride_C, - C, - rocblas_datatype_f16_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - CUDA_R_16F, - (op_A == CUBLAS_OP_N) ? m : k, - stride_A, - B, - CUDA_R_16F, - (op_B == CUBLAS_OP_N) ? k : n, - stride_B, - beta, - C, - CUDA_R_16F, - m, - stride_C, - batch, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - - return 0; -} diff --git a/deepspeed/ops/csrc/transformer_bak/cublas_wrappers.hip b/deepspeed/ops/csrc/transformer_bak/cublas_wrappers.hip deleted file mode 100644 index 04aa0ef0a7d083a50fc7d4ec8f01b24e2ccd52e8..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/cublas_wrappers.hip +++ /dev/null @@ -1,404 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "cublas_wrappers_hip.h" - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f32_r, - (transa == rocblas_operation_none) ? 
m : k, - (const void*)B, - rocblas_datatype_f32_r, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - C, - rocblas_datatype_f32_r, - m, - C, - rocblas_datatype_f32_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - rocblas_status status = rocblas_gemmex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - hipR32F, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - hipR32F, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - C, - hipR32F, - m, - hipR32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != rocblas_status_success) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f16_r, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - rocblas_datatype_f16_r, - (transb == rocblas_operation_none) ? 
k : n, - (const void*)beta, - (void*)C, - rocblas_datatype_f16_r, - m, - (void*)C, - rocblas_datatype_f16_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - rocblas_status status = rocblas_gemmex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - hipR16F, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - hipR16F, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - (void*)C, - hipR16F, - m, - hipR32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != rocblas_status_success) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f32_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f32_r, - (op_B == rocblas_operation_none) ? 
k : n, - stride_B, - beta, - C, - rocblas_datatype_f32_r, - m, - stride_C, - C, - rocblas_datatype_f32_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - rocblas_status status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - hipR32F, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - hipR32F, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - hipR32F, - m, - stride_C, - batch, - hipR32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != rocblas_status_success) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n", - batch, - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f16_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f16_r, - (op_B == rocblas_operation_none) ? 
k : n, - stride_B, - beta, - C, - rocblas_datatype_f16_r, - m, - stride_C, - C, - rocblas_datatype_f16_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - rocblas_status status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - hipR16F, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - hipR16F, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - hipR16F, - m, - stride_C, - batch, - hipR32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != rocblas_status_success) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - - return 0; -} diff --git a/deepspeed/ops/csrc/transformer_bak/dropout_kernels.cu b/deepspeed/ops/csrc/transformer_bak/dropout_kernels.cu deleted file mode 100644 index d1ba135f4900f8eff3f6b4cab70d1b35b39f7833..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/dropout_kernels.cu +++ /dev/null @@ -1,868 +0,0 @@ -#include "custom_cuda_layers.h" - -const int unroll_factor = 4; - -__global__ void dropout_kernel(const int N, - const float ratio, - float* out, - const float* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float4 rand = curand_uniform4(&state); - uint8_t m[unroll_factor]; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - int i = j * unroll_factor; - - mask[i] = (uint8_t)m[0]; - mask[i + 1] = (uint8_t)m[1]; - mask[i + 2] = (uint8_t)m[2]; - mask[i + 3] = (uint8_t)m[3]; - - out[i] = Xdata[i] * scale * m[0]; - out[i + 1] = Xdata[i + 1] * scale * m[1]; - out[i + 2] = Xdata[i + 2] * scale * m[2]; - out[i + 3] = Xdata[i + 3] * scale * m[3]; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - out[i] = Xdata[i] * scale * m; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const float ratio, - __half* out, - const __half* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - -#ifdef __STOCHASTIC_MODE__ - - const __half2 h_scale = __float2half2_rn(scale); - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_cast = reinterpret_cast(mask); - - uint32_t m_32; - uint8_t* m = reinterpret_cast(&m_32); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - __half2 mask_h[2]; - float2 mask_f[2]; - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_f = x_cast[j]; - __half2* x_h = reinterpret_cast<__half2*>(&x_f); - - float4 rand = curand_uniform4(&state); - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); - - mask_h[0] = __float22half2_rn(mask_f[0]); - mask_h[1] = __float22half2_rn(mask_f[1]); - - result_h[0] = x_h[0] * h_scale * mask_h[0]; - result_h[1] = x_h[1] * h_scale * mask_h[1]; - - out_cast[j] = result_f; - - mask_cast[j] = m_32; - } - -#else - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - const __half2* vals_half = reinterpret_cast(Xdata + i); - float2 vals_half_f[2]; - vals_half_f[0] = __half22float2(vals_half[0]); - vals_half_f[1] = __half22float2(vals_half[1]); - - uint8_t m[unroll_factor]; - float4 rand = curand_uniform4(&state); - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - out[i] = __float2half(vals_half_f[0].x * scale * m[0]); - out[i + 1] = __float2half(vals_half_f[0].y * scale * m[1]); - out[i + 2] = __float2half(vals_half_f[1].x * scale * m[2]); - out[i + 3] = __float2half(vals_half_f[1].y * scale * m[3]); - - mask[i] = m[0]; 
- mask[i + 1] = m[1]; - mask[i + 2] = m[2]; - mask[i + 3] = m[3]; - } - -#endif - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - out[i] = __float2half((float)Xdata[i] * scale * m); - mask[i] = m; - } - } -} - -__global__ void dropout_kernel_bwd(const int N, - const float ratio, - const float* Xdata, - float* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - out[i] = mask[i] ? Xdata[i] * scale : 0.0; - out[i + 1] = mask[i + 1] ? Xdata[i + 1] * scale : 0.0; - out[i + 2] = mask[i + 2] ? Xdata[i + 2] * scale : 0.0; - out[i + 3] = mask[i + 3] ? Xdata[i + 3] * scale : 0.0; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { out[i] = mask[i] ? Xdata[i] * scale : 0.0; } - } -} - -__global__ void dropout_kernel_bwd(const int N, - const float ratio, - const __half* Xdata, - __half* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - -#ifdef __STOCHASTIC_MODE__ - - const __half2 h_scale = __float2half2_rn(scale); - - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_cast = reinterpret_cast(mask); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_f = x_cast[j]; - __half2* x_h = reinterpret_cast<__half2*>(&x_f); - - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - __half2 mask_h[2]; - float2 mask_f[2]; - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); - -#pragma unroll - for (int i = 0; i < 2; i++) mask_h[i] = __float22half2_rn(mask_f[i]); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = x_h[0] * h_scale * mask_h[0]; - result_h[1] = x_h[1] * h_scale * mask_h[1]; - - out_cast[j] = result_f; - } - -#else - - const __half h_scale = __float2half(scale); - const __half h_zero = __float2half(0.0); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - const __half2* vals_half = reinterpret_cast(Xdata + i); - - uint8_t* m = mask + i; - - float2 vals_half_f[2]; - - vals_half_f[0] = __half22float2(vals_half[0]); - vals_half_f[1] = __half22float2(vals_half[1]); - - out[i] = __float2half(vals_half_f[0].x * scale * m[0]); - out[i + 1] = __float2half(vals_half_f[0].y * scale * m[1]); - out[i + 2] = __float2half(vals_half_f[1].x * scale * m[2]); - out[i + 3] = __float2half(vals_half_f[1].y * scale * m[3]); - } - -#endif - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - out[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout(T* out, - const T* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - cudaStream_t stream, - bool bwd) -{ - assert(unroll_factor == 4); - - dim3 
grid_dim = DS_GET_BLOCKS(total_count / unroll_factor); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - if (dim > 512) { - block_dim.x >>= 1; - grid_dim.x <<= 1; - } - uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - if (bwd) - dropout_kernel_bwd<<>>( - total_count, ratio, vals, out, mask, seed); - else - dropout_kernel<<>>( - total_count, ratio, out, vals, mask, seed); -} - -template void launch_dropout(float* out, - const float* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - cudaStream_t stream, - bool); -template void launch_dropout(__half* out, - const __half* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - cudaStream_t stream, - bool); - -__global__ void dropout_grad_kernel(const int N, const float scale, float* Xdata, uint8_t* mask) -{ - CUDA_1D_KERNEL_LOOP(i, N) { Xdata[i] *= scale * mask[i]; } -} - -__global__ void dropout_grad_kernel(const int N, const float scale, __half* Xdata, uint8_t* mask) -{ - const __half2 h_scale = __float2half2_rn(scale); - float2* x_cast = reinterpret_cast(Xdata); - uint32_t* mask_cast = reinterpret_cast(mask); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_data = x_cast[j]; - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - -#ifdef __STOCHASTIC_MODE__ - - __half2* x_data_h = reinterpret_cast<__half2*>(&x_data); - __half2 mask_h[2]; - float2 mask_f[2]; - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) *(mask_f_data++) = (float)(m[i]); - - mask_h[0] = __float22half2_rn(mask_f[0]); - mask_h[1] = __float22half2_rn(mask_f[1]); - - result_h[0] = x_data_h[0] * h_scale * mask_h[0]; - result_h[1] = x_data_h[1] * h_scale * mask_h[1]; - -#else - - __half* x_data_h = reinterpret_cast<__half*>(&x_data); - float2 result[2]; - - result[0].x = (float)x_data_h[0] * scale * m[0]; - 
result[0].y = (float)x_data_h[1] * scale * m[1]; - result[1].x = (float)x_data_h[2] * scale * m[2]; - result[1].y = (float)x_data_h[3] * scale * m[3]; - - result_h[0] = __float22half2_rn(result[0]); - result_h[1] = __float22half2_rn(result[1]); - -#endif - x_cast[j] = result_f; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - Xdata[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, cudaStream_t stream) -{ - assert(unroll_factor == 4); - - const float scale = 1. / (1. - ratio); - dropout_grad_kernel<<>>(total_count, scale, vals, mask); -} - -template void launch_dropout_grad(float* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream); -template void launch_dropout_grad(__half* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream); - -__global__ void dropout_grad_kernel(const int N, - const float scale, - const float* Xdata, - float* out, - uint8_t* mask) -{ - CUDA_1D_KERNEL_LOOP(i, N) { out[i] = Xdata[i] * scale * mask[i]; } -} - -__global__ void dropout_grad_kernel(const int N, - const float scale, - const __half* Xdata, - __half* out, - uint8_t* mask) -{ - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - const uint32_t* mask_cast = reinterpret_cast(mask); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_data = x_cast[j]; - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - __half* x_data_h = reinterpret_cast<__half*>(&x_data); - float2 result[2]; - - result[0].x = (float)x_data_h[0] * scale * m[0]; - result[0].y = (float)x_data_h[1] * scale * m[1]; - result[1].x = (float)x_data_h[2] * scale * m[2]; - result[1].y = 
(float)x_data_h[3] * scale * m[3]; - - result_h[0] = __float22half2_rn(result[0]); - result_h[1] = __float22half2_rn(result[1]); - - out_cast[j] = result_f; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - out[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout_grad(T* vals_out, - const T* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream) -{ - assert(unroll_factor == 4); - - const float scale = 1. / (1. - ratio); - dropout_grad_kernel<<>>(total_count, scale, vals, vals_out, mask); -} -template void launch_dropout_grad(float*, - const float* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream); -template void launch_dropout_grad(__half*, - const __half* vals, - uint8_t* mask, - int total_count, - float ratio, - cudaStream_t stream); - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const float* bias, - float* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - float4* Xdata_cast = reinterpret_cast(Xdata); - uint32_t* mask_32 = reinterpret_cast(mask); - const float4* bias_cast = reinterpret_cast(bias); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = curand_uniform4(&state); - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float4 x_data = Xdata_cast[j]; - float4 b_data = bias_cast[j % (dim / unroll_factor)]; - - x_data.x += b_data.x; - x_data.y += b_data.y; - x_data.z += b_data.z; - x_data.w += b_data.w; - - x_data.x = x_data.x * scale * m[0]; - x_data.y = x_data.y * scale * m[1]; - x_data.z = x_data.z * scale * m[2]; - x_data.w = x_data.w * scale * m[3]; - - mask_32[j] = m_32; - Xdata_cast[j] = x_data; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = Xdata[i] + bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - Xdata[i] = x_data * scale * m; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const __half* bias, - __half* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - float2* Xdata_cast = reinterpret_cast(Xdata); - uint32_t* mask_32 = reinterpret_cast(mask); - const float2* bias_cast = reinterpret_cast(bias); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = curand_uniform4(&state); - - float2 data_f; - __half2* data_h = reinterpret_cast<__half2*>(&data_f); - - float2 bias_f; - __half2* bias_h = reinterpret_cast<__half2*>(&bias_f); - - data_f = Xdata_cast[j]; - bias_f = bias_cast[j % (dim / unroll_factor)]; - - float2 data_h_0 = __half22float2(data_h[0]); - float2 data_h_1 = __half22float2(data_h[1]); - - float2 bias_h_0 = __half22float2(bias_h[0]); - float2 bias_h_1 = __half22float2(bias_h[1]); - - data_h_0.x += bias_h_0.x; - data_h_0.y += bias_h_0.y; - data_h_1.x += bias_h_1.x; - data_h_1.y += bias_h_1.y; - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - data_h_0.x = __float2half(data_h_0.x * scale * m[0]); - data_h_0.y = __float2half(data_h_0.y * scale * m[1]); - data_h_1.x = __float2half(data_h_1.x * scale * m[2]); - data_h_1.y = __float2half(data_h_1.y * scale * m[3]); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = __float22half2_rn(data_h_0); - result_h[1] = __float22half2_rn(data_h_1); - - Xdata_cast[j] = result_f; - mask_32[j] = m_32; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = (float)Xdata[i] + (float)bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - Xdata[i] = 
__float2half(x_data * scale * m); - mask[i] = m; - } - } -} - -template -void launch_dropout(T* out, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream) -{ - assert(unroll_factor == 4); - - int total_count = batch * dim / unroll_factor; - - dim3 grid_dim = DS_GET_BLOCKS(total_count); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - dropout_kernel<<>>( - total_count, dim, ratio, bias, out, mask, seed); -} - -template void launch_dropout(float*, - const float* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); -template void launch_dropout(__half*, - const __half* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const float* input, - const float* residual, - const float* bias, - float* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - float4* out_cast = reinterpret_cast(out); - uint32_t* mask_32 = reinterpret_cast(mask); - - const float4* bias_cast = reinterpret_cast(bias); - const float4* residual_cast = reinterpret_cast(residual); - const float4* input_cast = reinterpret_cast(input); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = curand_uniform4(&state); - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float4 out_data; - float4 b_data = bias_cast[j % (dim / unroll_factor)]; - float4 res_data = residual_cast[j]; - float4 inp_data = input_cast[j]; - - out_data.x = (b_data.x + inp_data.x); - out_data.y = (b_data.y + inp_data.y); - out_data.z = (b_data.z + inp_data.z); - out_data.w = (b_data.w + inp_data.w); - - out_data.x = out_data.x * scale * m[0]; - out_data.y = out_data.y * scale * m[1]; - out_data.z = out_data.z * scale * m[2]; - out_data.w = out_data.w * scale * m[3]; - - out_data.x += res_data.x; - out_data.y += res_data.y; - out_data.z += res_data.z; - out_data.w += res_data.w; - - mask_32[j] = m_32; - out_cast[j] = out_data; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = input[i] + bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - x_data = x_data * scale * m; - x_data += residual[i]; - - out[i] = x_data; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const __half* input, - const __half* residual, - const __half* bias, - 
__half* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_32 = reinterpret_cast(mask); - - const float2* bias_cast = reinterpret_cast(bias); - const float2* residual_cast = reinterpret_cast(residual); - const float2* input_cast = reinterpret_cast(input); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = curand_uniform4(&state); - - float2 data_f; - __half2* data_h = reinterpret_cast<__half2*>(&data_f); - - float2 bias_f; - __half2* bias_h = reinterpret_cast<__half2*>(&bias_f); - - float2 residual_f; - __half2* residual_h = reinterpret_cast<__half2*>(&residual_f); - - float2 input_f; - __half2* input_h = reinterpret_cast<__half2*>(&input_f); - - bias_f = bias_cast[j % (dim / unroll_factor)]; - residual_f = residual_cast[j]; - input_f = input_cast[j]; - - float2 data_h_0 = __half22float2(data_h[0]); - float2 data_h_1 = __half22float2(data_h[1]); - - float2 bias_h_0 = __half22float2(bias_h[0]); - float2 bias_h_1 = __half22float2(bias_h[1]); - - float2 residual_h_0 = __half22float2(residual_h[0]); - float2 residual_h_1 = __half22float2(residual_h[1]); - - float2 input_h_0 = __half22float2(input_h[0]); - float2 input_h_1 = __half22float2(input_h[1]); - - data_h_0.x = (bias_h_0.x + input_h_0.x); - data_h_0.y = (bias_h_0.y + input_h_0.y); - data_h_1.x = (bias_h_1.x + input_h_1.x); - data_h_1.y = (bias_h_1.y + input_h_1.y); - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - data_h_0.x = __float2half(data_h_0.x * scale * m[0]); - data_h_0.y = __float2half(data_h_0.y * scale * m[1]); - data_h_1.x = __float2half(data_h_1.x * scale * m[2]); - 
data_h_1.y = __float2half(data_h_1.y * scale * m[3]); - - data_h_0.x += residual_h_0.x; - data_h_0.y += residual_h_0.y; - data_h_1.x += residual_h_1.x; - data_h_1.y += residual_h_1.y; - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = __float22half2_rn(data_h_0); - result_h[1] = __float22half2_rn(data_h_1); - - out_cast[j] = result_f; - mask_32[j] = m_32; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = curand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = (float)input[i] + (float)bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - x_data = x_data * scale * m; - x_data += (float)residual[i]; - - out[i] = __float2half(x_data); - mask[i] = m; - } - } -} - -template -void launch_dropout(T* out, - const T* input, - const T* residual, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream) -{ - assert(unroll_factor == 4); - - int total_count = batch * dim / unroll_factor; - dim3 grid_dim = DS_GET_BLOCKS(total_count); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - dropout_kernel<<>>( - total_count, dim, ratio, input, residual, bias, out, mask, seed); -} - -template void launch_dropout(float*, - const float*, - const float* residual, - const float* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); -template void launch_dropout(__half*, - const __half*, - const __half* residual, - const __half* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/dropout_kernels.hip b/deepspeed/ops/csrc/transformer_bak/dropout_kernels.hip deleted file mode 100644 index 
a4b880a721e9833d10bccd0fa438acf6b14ded54..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/dropout_kernels.hip +++ /dev/null @@ -1,870 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -const int unroll_factor = 4; - -__global__ void dropout_kernel(const int N, - const float ratio, - float* out, - const float* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float4 rand = hiprand_uniform4(&state); - uint8_t m[unroll_factor]; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - int i = j * unroll_factor; - - mask[i] = (uint8_t)m[0]; - mask[i + 1] = (uint8_t)m[1]; - mask[i + 2] = (uint8_t)m[2]; - mask[i + 3] = (uint8_t)m[3]; - - out[i] = Xdata[i] * scale * m[0]; - out[i + 1] = Xdata[i + 1] * scale * m[1]; - out[i + 2] = Xdata[i + 2] * scale * m[2]; - out[i + 3] = Xdata[i + 3] * scale * m[3]; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - out[i] = Xdata[i] * scale * m; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const float ratio, - __half* out, - const __half* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - -#ifdef __STOCHASTIC_MODE__ - - const __half2 h_scale = __float2half2_rn(scale); - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_cast = reinterpret_cast(mask); - - uint32_t m_32; - uint8_t* m = reinterpret_cast(&m_32); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - __half2 mask_h[2]; - float2 mask_f[2]; - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_f = x_cast[j]; - __half2* x_h = reinterpret_cast<__half2*>(&x_f); - - float4 rand = hiprand_uniform4(&state); - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); - - mask_h[0] = __float22half2_rn(mask_f[0]); - mask_h[1] = __float22half2_rn(mask_f[1]); - - result_h[0] = x_h[0] * h_scale * mask_h[0]; - result_h[1] = x_h[1] * h_scale * mask_h[1]; - - out_cast[j] = result_f; - - mask_cast[j] = m_32; - } - -#else - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - const __half2* vals_half = reinterpret_cast(Xdata + i); - float2 vals_half_f[2]; - vals_half_f[0] = __half22float2(vals_half[0]); - vals_half_f[1] = __half22float2(vals_half[1]); - - uint8_t m[unroll_factor]; - float4 rand = hiprand_uniform4(&state); - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - out[i] = __float2half(vals_half_f[0].x * scale * m[0]); - out[i + 1] = __float2half(vals_half_f[0].y * scale * m[1]); - out[i + 2] = __float2half(vals_half_f[1].x * scale * m[2]); - out[i + 3] = __float2half(vals_half_f[1].y * scale * m[3]); - - mask[i] = 
m[0]; - mask[i + 1] = m[1]; - mask[i + 2] = m[2]; - mask[i + 3] = m[3]; - } - -#endif - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - out[i] = __float2half((float)Xdata[i] * scale * m); - mask[i] = m; - } - } -} - -__global__ void dropout_kernel_bwd(const int N, - const float ratio, - const float* Xdata, - float* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - out[i] = mask[i] ? Xdata[i] * scale : 0.0; - out[i + 1] = mask[i + 1] ? Xdata[i + 1] * scale : 0.0; - out[i + 2] = mask[i + 2] ? Xdata[i + 2] * scale : 0.0; - out[i + 3] = mask[i + 3] ? Xdata[i + 3] * scale : 0.0; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { out[i] = mask[i] ? Xdata[i] * scale : 0.0; } - } -} - -__global__ void dropout_kernel_bwd(const int N, - const float ratio, - const __half* Xdata, - __half* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - -#ifdef __STOCHASTIC_MODE__ - - const __half2 h_scale = __float2half2_rn(scale); - - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_cast = reinterpret_cast(mask); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_f = x_cast[j]; - __half2* x_h = reinterpret_cast<__half2*>(&x_f); - - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - __half2 mask_h[2]; - float2 mask_f[2]; - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); - -#pragma unroll - for (int i = 0; i < 2; i++) mask_h[i] = __float22half2_rn(mask_f[i]); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = x_h[0] * h_scale * mask_h[0]; - result_h[1] = x_h[1] * h_scale * mask_h[1]; - - out_cast[j] = result_f; - } - -#else - - const __half h_scale = __float2half(scale); - const __half h_zero = __float2half(0.0); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - int i = j * unroll_factor; - - const __half2* vals_half = reinterpret_cast(Xdata + i); - - uint8_t* m = mask + i; - - float2 vals_half_f[2]; - - vals_half_f[0] = __half22float2(vals_half[0]); - vals_half_f[1] = __half22float2(vals_half[1]); - - out[i] = __float2half(vals_half_f[0].x * scale * m[0]); - out[i + 1] = __float2half(vals_half_f[0].y * scale * m[1]); - out[i + 2] = __float2half(vals_half_f[1].x * scale * m[2]); - out[i + 3] = __float2half(vals_half_f[1].y * scale * m[3]); - } - -#endif - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - out[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout(T* out, - const T* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - hipStream_t stream, - bool bwd) -{ - assert(unroll_factor == 4); - - dim3 
grid_dim = DS_GET_BLOCKS(total_count / unroll_factor); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - if (dim > 512) { - block_dim.x >>= 1; - grid_dim.x <<= 1; - } - uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - if (bwd) - hipLaunchKernelGGL(( dropout_kernel_bwd), dim3(grid_dim), dim3(block_dim), 0, stream, - total_count, ratio, vals, out, mask, seed); - else - hipLaunchKernelGGL(( dropout_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - total_count, ratio, out, vals, mask, seed); -} - -template void launch_dropout(float* out, - const float* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - hipStream_t stream, - bool); -template void launch_dropout(__half* out, - const __half* vals, - uint8_t* mask, - int total_count, - int dim, - float ratio, - hipStream_t stream, - bool); - -__global__ void dropout_grad_kernel(const int N, const float scale, float* Xdata, uint8_t* mask) -{ - CUDA_1D_KERNEL_LOOP(i, N) { Xdata[i] *= scale * mask[i]; } -} - -__global__ void dropout_grad_kernel(const int N, const float scale, __half* Xdata, uint8_t* mask) -{ - const __half2 h_scale = __float2half2_rn(scale); - float2* x_cast = reinterpret_cast(Xdata); - uint32_t* mask_cast = reinterpret_cast(mask); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_data = x_cast[j]; - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - -#ifdef __STOCHASTIC_MODE__ - - __half2* x_data_h = reinterpret_cast<__half2*>(&x_data); - __half2 mask_h[2]; - float2 mask_f[2]; - - float* mask_f_data = &mask_f[0].x; -#pragma unroll - for (int i = 0; i < unroll_factor; i++) *(mask_f_data++) = (float)(m[i]); - - mask_h[0] = __float22half2_rn(mask_f[0]); - mask_h[1] = __float22half2_rn(mask_f[1]); - - result_h[0] = x_data_h[0] * h_scale * mask_h[0]; - result_h[1] = x_data_h[1] * h_scale * mask_h[1]; - -#else - - __half* 
x_data_h = reinterpret_cast<__half*>(&x_data); - float2 result[2]; - - result[0].x = (float)x_data_h[0] * scale * m[0]; - result[0].y = (float)x_data_h[1] * scale * m[1]; - result[1].x = (float)x_data_h[2] * scale * m[2]; - result[1].y = (float)x_data_h[3] * scale * m[3]; - - result_h[0] = __float22half2_rn(result[0]); - result_h[1] = __float22half2_rn(result[1]); - -#endif - x_cast[j] = result_f; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - Xdata[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream) -{ - assert(unroll_factor == 4); - - const float scale = 1. / (1. - ratio); - hipLaunchKernelGGL(( dropout_grad_kernel), dim3(DS_GET_BLOCKS(total_count / unroll_factor)), - dim3(DS_CUDA_NUM_THREADS), - 0, - stream, total_count, scale, vals, mask); -} - -template void launch_dropout_grad(float* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream); -template void launch_dropout_grad(__half* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream); - -__global__ void dropout_grad_kernel(const int N, - const float scale, - const float* Xdata, - float* out, - uint8_t* mask) -{ - CUDA_1D_KERNEL_LOOP(i, N) { out[i] = Xdata[i] * scale * mask[i]; } -} - -__global__ void dropout_grad_kernel(const int N, - const float scale, - const __half* Xdata, - __half* out, - uint8_t* mask) -{ - const float2* x_cast = reinterpret_cast(Xdata); - float2* out_cast = reinterpret_cast(out); - const uint32_t* mask_cast = reinterpret_cast(mask); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) - { - float2 x_data = x_cast[j]; - uint32_t m_32 = mask_cast[j]; - uint8_t* m = (uint8_t*)&m_32; - - __half* x_data_h = 
reinterpret_cast<__half*>(&x_data); - float2 result[2]; - - result[0].x = (float)x_data_h[0] * scale * m[0]; - result[0].y = (float)x_data_h[1] * scale * m[1]; - result[1].x = (float)x_data_h[2] * scale * m[2]; - result[1].y = (float)x_data_h[3] * scale * m[3]; - - result_h[0] = __float22half2_rn(result[0]); - result_h[1] = __float22half2_rn(result[1]); - - out_cast[j] = result_f; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - for (int i = high_index; i < N; i++) { - out[i] = __float2half((float)Xdata[i] * scale * mask[i]); - } - } -} - -template -void launch_dropout_grad(T* vals_out, - const T* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream) -{ - assert(unroll_factor == 4); - - const float scale = 1. / (1. - ratio); - hipLaunchKernelGGL(( dropout_grad_kernel), dim3(DS_GET_BLOCKS(total_count / unroll_factor)), - dim3(DS_CUDA_NUM_THREADS), - 0, - stream, total_count, scale, vals, vals_out, mask); -} -template void launch_dropout_grad(float*, - const float* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream); -template void launch_dropout_grad(__half*, - const __half* vals, - uint8_t* mask, - int total_count, - float ratio, - hipStream_t stream); - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const float* bias, - float* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - float4* Xdata_cast = reinterpret_cast(Xdata); - uint32_t* mask_32 = reinterpret_cast(mask); - const float4* bias_cast = reinterpret_cast(bias); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = hiprand_uniform4(&state); - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float4 x_data = Xdata_cast[j]; - float4 b_data = bias_cast[j % (dim / unroll_factor)]; - - x_data.x += b_data.x; - x_data.y += b_data.y; - x_data.z += b_data.z; - x_data.w += b_data.w; - - x_data.x = x_data.x * scale * m[0]; - x_data.y = x_data.y * scale * m[1]; - x_data.z = x_data.z * scale * m[2]; - x_data.w = x_data.w * scale * m[3]; - - mask_32[j] = m_32; - Xdata_cast[j] = x_data; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = Xdata[i] + bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - Xdata[i] = x_data * scale * m; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const __half* bias, - __half* Xdata, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - float2* Xdata_cast = reinterpret_cast(Xdata); - uint32_t* mask_32 = reinterpret_cast(mask); - const float2* bias_cast = reinterpret_cast(bias); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = hiprand_uniform4(&state); - - float2 data_f; - __half2* data_h = reinterpret_cast<__half2*>(&data_f); - - float2 bias_f; - __half2* bias_h = reinterpret_cast<__half2*>(&bias_f); - - data_f = Xdata_cast[j]; - bias_f = bias_cast[j % (dim / unroll_factor)]; - - float2 data_h_0 = __half22float2(data_h[0]); - float2 data_h_1 = __half22float2(data_h[1]); - - float2 bias_h_0 = __half22float2(bias_h[0]); - float2 bias_h_1 = __half22float2(bias_h[1]); - - data_h_0.x += bias_h_0.x; - data_h_0.y += bias_h_0.y; - data_h_1.x += bias_h_1.x; - data_h_1.y += bias_h_1.y; - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - data_h_0.x = __float2half(data_h_0.x * scale * m[0]); - data_h_0.y = __float2half(data_h_0.y * scale * m[1]); - data_h_1.x = __float2half(data_h_1.x * scale * m[2]); - data_h_1.y = __float2half(data_h_1.y * scale * m[3]); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = __float22half2_rn(data_h_0); - result_h[1] = __float22half2_rn(data_h_1); - - Xdata_cast[j] = result_f; - mask_32[j] = m_32; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = (float)Xdata[i] + (float)bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - Xdata[i] = 
__float2half(x_data * scale * m); - mask[i] = m; - } - } -} - -template -void launch_dropout(T* out, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream) -{ - assert(unroll_factor == 4); - - int total_count = batch * dim / unroll_factor; - - dim3 grid_dim = DS_GET_BLOCKS(total_count); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - hipLaunchKernelGGL(( dropout_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - total_count, dim, ratio, bias, out, mask, seed); -} - -template void launch_dropout(float*, - const float* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); -template void launch_dropout(__half*, - const __half* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const float* input, - const float* residual, - const float* bias, - float* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. 
- ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - float4* out_cast = reinterpret_cast(out); - uint32_t* mask_32 = reinterpret_cast(mask); - - const float4* bias_cast = reinterpret_cast(bias); - const float4* residual_cast = reinterpret_cast(residual); - const float4* input_cast = reinterpret_cast(input); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = hiprand_uniform4(&state); - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - float4 out_data; - float4 b_data = bias_cast[j % (dim / unroll_factor)]; - float4 res_data = residual_cast[j]; - float4 inp_data = input_cast[j]; - - out_data.x = (b_data.x + inp_data.x); - out_data.y = (b_data.y + inp_data.y); - out_data.z = (b_data.z + inp_data.z); - out_data.w = (b_data.w + inp_data.w); - - out_data.x = out_data.x * scale * m[0]; - out_data.y = out_data.y * scale * m[1]; - out_data.z = out_data.z * scale * m[2]; - out_data.w = out_data.w * scale * m[3]; - - out_data.x += res_data.x; - out_data.y += res_data.y; - out_data.z += res_data.z; - out_data.w += res_data.w; - - mask_32[j] = m_32; - out_cast[j] = out_data; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = input[i] + bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - x_data = x_data * scale * m; - x_data += residual[i]; - - out[i] = x_data; - mask[i] = m; - } - } -} - -__global__ void dropout_kernel(const int N, - const int dim, - const float ratio, - const __half* input, - const __half* residual, - const __half* 
bias, - __half* out, - uint8_t* mask, - std::pair seed) -{ - const float scale = 1. / (1. - ratio); - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x % (dim / unroll_factor); - - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed.first, idx, seed.second, &state); - - float2* out_cast = reinterpret_cast(out); - uint32_t* mask_32 = reinterpret_cast(mask); - - const float2* bias_cast = reinterpret_cast(bias); - const float2* residual_cast = reinterpret_cast(residual); - const float2* input_cast = reinterpret_cast(input); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 rand = hiprand_uniform4(&state); - - float2 data_f; - __half2* data_h = reinterpret_cast<__half2*>(&data_f); - - float2 bias_f; - __half2* bias_h = reinterpret_cast<__half2*>(&bias_f); - - float2 residual_f; - __half2* residual_h = reinterpret_cast<__half2*>(&residual_f); - - float2 input_f; - __half2* input_h = reinterpret_cast<__half2*>(&input_f); - - bias_f = bias_cast[j % (dim / unroll_factor)]; - residual_f = residual_cast[j]; - input_f = input_cast[j]; - - float2 data_h_0 = __half22float2(data_h[0]); - float2 data_h_1 = __half22float2(data_h[1]); - - float2 bias_h_0 = __half22float2(bias_h[0]); - float2 bias_h_1 = __half22float2(bias_h[1]); - - float2 residual_h_0 = __half22float2(residual_h[0]); - float2 residual_h_1 = __half22float2(residual_h[1]); - - float2 input_h_0 = __half22float2(input_h[0]); - float2 input_h_1 = __half22float2(input_h[1]); - - data_h_0.x = (bias_h_0.x + input_h_0.x); - data_h_0.y = (bias_h_0.y + input_h_0.y); - data_h_1.x = (bias_h_1.x + input_h_1.x); - data_h_1.y = (bias_h_1.y + input_h_1.y); - - uint32_t m_32; - uint8_t* m = (uint8_t*)&m_32; - - m[0] = (uint8_t)(rand.x > ratio); - m[1] = (uint8_t)(rand.y > ratio); - m[2] = (uint8_t)(rand.z > ratio); - m[3] = (uint8_t)(rand.w > ratio); - - data_h_0.x = __float2half(data_h_0.x * scale * m[0]); - data_h_0.y = __float2half(data_h_0.y * scale * m[1]); - data_h_1.x = __float2half(data_h_1.x * scale * 
m[2]); - data_h_1.y = __float2half(data_h_1.y * scale * m[3]); - - data_h_0.x += residual_h_0.x; - data_h_0.y += residual_h_0.y; - data_h_1.x += residual_h_1.x; - data_h_1.y += residual_h_1.y; - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - result_h[0] = __float22half2_rn(data_h_0); - result_h[1] = __float22half2_rn(data_h_1); - - out_cast[j] = result_f; - mask_32[j] = m_32; - } - int high_index = - ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; - if (N > high_index) { - float4 rand = hiprand_uniform4(&state); - float* rand_data = &(rand.x); - int k = 0; - for (int i = high_index; i < N; i++) { - float x_data = (float)input[i] + (float)bias[i % dim]; - uint8_t m = (uint8_t)(rand_data[k++] > ratio); - x_data = x_data * scale * m; - x_data += (float)residual[i]; - - out[i] = __float2half(x_data); - mask[i] = m; - } - } -} - -template -void launch_dropout(T* out, - const T* input, - const T* residual, - const T* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream) -{ - assert(unroll_factor == 4); - - int total_count = batch * dim / unroll_factor; - dim3 grid_dim = DS_GET_BLOCKS(total_count); - dim3 block_dim = DS_CUDA_NUM_THREADS; - - uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - hipLaunchKernelGGL(( dropout_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - total_count, dim, ratio, input, residual, bias, out, mask, seed); -} - -template void launch_dropout(float*, - const float*, - const float* residual, - const float* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); -template void launch_dropout(__half*, - const __half*, - const __half* residual, - const __half* bias, - uint8_t* mask, - int batch, - int dim, - float ratio, - hipStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/ds_transformer_cuda.cpp 
b/deepspeed/ops/csrc/transformer_bak/ds_transformer_cuda.cpp deleted file mode 100644 index 18e7fffc1f5ddcd28588589742cf384c0fd96080..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/ds_transformer_cuda.cpp +++ /dev/null @@ -1,1051 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include "Timer.h" -#include "context.h" -#include "cublas_wrappers.h" -#include "custom_cuda_layers.h" -#include "ds_transformer_cuda.h" - -static std::unordered_map> s_transformer_layers; - -const int init_seq_length = 128; - -// C++ interface - -template -unsigned get_workspace_size(unsigned maxBatchSize, - unsigned seq_len, - unsigned hidden_size, - unsigned intermediate_size, - unsigned heads, - bool training, - bool gelu_checkpoint) -{ - unsigned workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size); - if (training) { - workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size); - workSpacesize += ((std::max)((size_t(maxBatchSize) * seq_len * intermediate_size), - 2 * (size_t(maxBatchSize) * heads * seq_len * seq_len))); - if (gelu_checkpoint) - workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * intermediate_size); - } - return workSpacesize; // * sizeof(T); -} - -// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
-#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -template -BertTransformerLayer::BertTransformerLayer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_size, - unsigned num_heads, - unsigned intermediate_size, - unsigned seq_length, - float attn_prob_dropout_ratio, - float hidden_output_dropout_ratio, - float layer_norm_eps, - bool pre_or_postLayerNorm, - const std::vector>& gemm_algos, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode) - : _layer_id(layer_id), - _batch_size(batch_size), - _hidden_size(hidden_size), - _heads(num_heads), - _intermediate_size(intermediate_size), - _seq_length(seq_length), - _training(true), - _pre_or_postLayerNorm(pre_or_postLayerNorm), - _attn_dropout_checkpoint(attn_dropout_checkpoint), - _normalize_invertible(normalize_invertible), - _gelu_checkpoint(gelu_checkpoint), - _stochastic_mode(stochastic_mode), - _stream(Context::Instance().GetCurrentStream()), - _cublasHandle(Context::Instance().GetCublasHandle()), - _qkv_linear(typename FeedForward::Config(batch_size * seq_length, - 3 * hidden_size, - hidden_size, - gemm_algos[0])), - _attn_out_linear(typename FeedForward::Config(batch_size * seq_length, - hidden_size, - hidden_size, - gemm_algos[0])), - _attn_layer_norm(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - layer_norm_eps, - true, - !normalize_invertible)), - _layer_norm(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - layer_norm_eps, - true, - !normalize_invertible)), - _ff1(typename FeedForward::Config(batch_size * seq_length, - _intermediate_size, - hidden_size, - gemm_algos[1])), - _ff2(typename FeedForward::Config(batch_size * seq_length, - hidden_size, - _intermediate_size, - gemm_algos[2])), - _softmax(typename 
Softmax::Config(batch_size, num_heads, seq_length)), - _gelu(typename Gelu::Config(_intermediate_size)), - _attn_prob_dropout(typename Dropout::Config(attn_prob_dropout_ratio, _seq_length)), - _attn_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), - _layer_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), - _attn_scores(typename StridedBatchGemm::Config(_batch_size * _heads, - _seq_length, - _seq_length, - _hidden_size / _heads, - //aiss debug 0506 - //(T(1.0) / T(sqrt(_hidden_size / _heads))), - (T(1.0 / (sqrt(_hidden_size / _heads)))), - T(0.0), - CUBLAS_OP_T, - CUBLAS_OP_N, - gemm_algos[3])), - _attn_context(typename StridedBatchGemm::Config(_batch_size * _heads, - _hidden_size / _heads, - _seq_length, - _seq_length, - T(1.0), - T(0.0), - CUBLAS_OP_N, - CUBLAS_OP_N, - gemm_algos[4])) -{ - assert(_hidden_size % _heads == 0); - - Initialize(); -} - -template -BertTransformerLayer::~BertTransformerLayer() -{ -} - -template -void BertTransformerLayer::Initialize() -{ -#ifndef __HIP_PLATFORM_HCC__ - if (std::is_same::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); -#endif -} - -template -void BertTransformerLayer::Forward(unsigned bsz, - const T* input_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_qkvb_ptr, - const T* attn_ow_ptr, - const T* attn_ob_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* output_b_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - T* out_ptr, - T* inp_norm_ptr, - T* q_tf_ptr, - T* k_tf_ptr, - T* v_tf_ptr, - T* soft_out_ptr, - T* ctx_bufB_ptr, - T* attn_o_inp_ptr, - T* add_res_ptr, - T* ff1_inp_ptr, - T* gelu_inp_ptr, - T* ff2_inp_ptr) -{ - cublasSetStream(_cublasHandle, _stream); - - if (!_stochastic_mode) cudaStreamSynchronize(_stream); - - T* workspace = static_cast(Context::Instance().GetWorkSpace()); - size_t small_buf_size = bsz * 
_seq_length * _hidden_size; - T* buf_0 = workspace; - T* buf_1 = buf_0 + small_buf_size; - T* buf_2 = buf_1; - - if (_normalize_invertible) { - add_res_ptr = buf_1 + 3 * small_buf_size; - buf_2 = add_res_ptr; - } - if (_gelu_checkpoint) buf_2 += small_buf_size; - if (_attn_dropout_checkpoint) - ctx_bufB_ptr = - (_gelu_checkpoint ? (buf_2 + (_intermediate_size / _hidden_size) * small_buf_size) - : (buf_1 + 4 * small_buf_size)); - - int bsz_seq = bsz * _seq_length; - - if (_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.ForwardCheckpoint( - bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - - else - _layer_norm.Forward( - bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - } - - if (_pre_or_postLayerNorm) - _qkv_linear.Forward(bsz_seq, inp_norm_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); - else - _qkv_linear.Forward(bsz_seq, input_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); - - launch_bias_add_transform_0213( - q_tf_ptr, buf_0, attn_qkvb_ptr, bsz, _seq_length, _hidden_size, _heads, _stream, 3); - - int bsz_heads = bsz * _heads; - - // attention scores - _attn_scores.Forward(bsz_heads, soft_out_ptr, k_tf_ptr, q_tf_ptr, _cublasHandle); - - // Softmax + Mask - _softmax.Forward(bsz, soft_out_ptr, input_mask_ptr, _stream); - - // attn prob dropout. - _attn_prob_dropout.Forward(bsz_heads * _seq_length, ctx_bufB_ptr, soft_out_ptr, _stream); - - // attention context - _attn_context.Forward(bsz_heads, buf_1, v_tf_ptr, ctx_bufB_ptr, _cublasHandle); - - launch_transform4d_0213( - attn_o_inp_ptr, buf_1, bsz, _heads, _seq_length, _hidden_size, _stream, 1); - - if (_pre_or_postLayerNorm) - _attn_out_linear.Forward(bsz_seq, attn_o_inp_ptr, attn_ow_ptr, buf_1, _cublasHandle); - else - _attn_out_linear.Forward(bsz_seq, attn_o_inp_ptr, attn_ow_ptr, ff1_inp_ptr, _cublasHandle); - - // attn output dropout. 
- if (_pre_or_postLayerNorm) - _attn_output_dropout.ForwardWithBias( - bsz_seq, add_res_ptr, buf_1, input_ptr, attn_ob_ptr, _stream); - else - _attn_output_dropout.ForwardWithBias( - bsz_seq, add_res_ptr, ff1_inp_ptr, input_ptr, attn_ob_ptr, _stream); - - if (_pre_or_postLayerNorm) { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.ForwardCheckpoint( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - else - _attn_layer_norm.Forward( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - } else { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.ForwardCheckpoint( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - else - _attn_layer_norm.Forward( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - } - - _ff1.Forward(bsz_seq, - ff1_inp_ptr, - inter_w_ptr, - (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), - _cublasHandle); - - _gelu.ForwardWithBiasAdd(bsz_seq, - (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), - inter_b_ptr, - (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), - _stream); - - _ff2.Forward( - bsz_seq, (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), output_w_ptr, out_ptr, _cublasHandle); - - // layer output dropout. 
- if (_pre_or_postLayerNorm) - _layer_output_dropout.ForwardWithBias( - bsz_seq, out_ptr, out_ptr, add_res_ptr, output_b_ptr, _stream); - else - _layer_output_dropout.ForwardWithBias( - bsz_seq, inp_norm_ptr, out_ptr, ff1_inp_ptr, output_b_ptr, _stream); - - if (!_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.ForwardCheckpoint( - bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - else - _layer_norm.Forward( - bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - } -} - -template -void BertTransformerLayer::Backward(unsigned bsz, - const T* grad_output_ptr, - const T* input_ptr, - const T* output_ptr, - const T* inp_norm_ptr, - const T* q_tf_ptr, - const T* k_tf_ptr, - const T* v_tf_ptr, - const T* soft_out_ptr, - const T* ctx_bufB_ptr, - const T* attn_o_inp_ptr, - const T* add_res_ptr, - const T* ff1_inp_ptr, - const T* gelu_inp_ptr, - const T* ff2_inp_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_ow_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - - T* grad_input_ptr, - T* grad_attn_qkvw_ptr, - T* grad_attn_qkvb_ptr, - T* grad_attn_ow_ptr, - T* grad_attn_ob_ptr, - T* grad_attn_nw_ptr, - T* grad_attn_nb_ptr, - T* grad_inter_w_ptr, - T* grad_inter_b_ptr, - T* grad_output_w_ptr, - T* grad_output_b_ptr, - T* grad_norm_w_ptr, - T* grad_norm_b_ptr) -{ - cublasSetStream(_cublasHandle, _stream); - - if (!_stochastic_mode) cudaStreamSynchronize(_stream); - - T* workspace = static_cast(Context::Instance().GetWorkSpace()); - size_t small_buf_size = bsz * _seq_length * _hidden_size; - T* buf_0 = workspace; - T* buf_1 = buf_0 + small_buf_size; - T* buf_2 = buf_1 + small_buf_size; - T* buf_3 = buf_2 + small_buf_size; - - T* ff2_buf = (_gelu_checkpoint ? 
buf_3 + (bsz * _seq_length * _intermediate_size) - : buf_3 + small_buf_size); - T* ctx_bufB_ptr_recomp = ff2_buf + (_seq_length * _seq_length * bsz * _heads); - - cudaStream_t streams[2] = {_stream, _stream}; - - int bsz_seq = bsz * _seq_length; - int bsz_heads = bsz * _heads; - - if (!_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - inp_norm_ptr); - - else - _layer_norm.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - output_ptr); - } - - if (_pre_or_postLayerNorm) - _layer_output_dropout.Backward(bsz_seq, buf_0, grad_output_ptr, _stream); - else - _layer_output_dropout.Backward(bsz_seq, buf_0, buf_1, _stream); - - const T* layer_dropout_buf = _layer_output_dropout.HasDropout() - ? buf_0 - : (_pre_or_postLayerNorm ? grad_output_ptr : buf_1); - - if (_gelu_checkpoint) - _gelu.ForwardWithBiasAdd(bsz_seq, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); - _ff2.Backward(bsz_seq, - layer_dropout_buf, - (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), - output_w_ptr, - grad_output_w_ptr, - grad_output_b_ptr, - _cublasHandle, - _stream, - ff2_buf); - - _gelu.Backward( - bsz_seq, ff2_buf, (_gelu_checkpoint ? 
ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, _stream); - - _ff1.Backward(bsz_seq, - ff2_buf, - ff1_inp_ptr, - inter_w_ptr, - grad_inter_w_ptr, - grad_inter_b_ptr, - _cublasHandle, - _stream, - buf_3); - - if (!_pre_or_postLayerNorm) - launch_fused_add2(buf_2, buf_3, buf_1, bsz, _seq_length, _hidden_size, _stream); - - if (_pre_or_postLayerNorm) { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); - - else - _attn_layer_norm.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); - } else { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); - - else - _attn_layer_norm.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); - } - - _attn_output_dropout.Backward(bsz_seq, buf_2, buf_0, _stream); - - T* attn_output_dropout_buf = _attn_output_dropout.HasDropout() ? buf_2 : buf_0; - - _attn_out_linear.Backward(bsz_seq, - attn_output_dropout_buf, - attn_o_inp_ptr, - attn_ow_ptr, - grad_attn_ow_ptr, - grad_attn_ob_ptr, - _cublasHandle, - _stream, - buf_1); - - launch_transform_0213(buf_2, buf_1, bsz, _seq_length, _hidden_size, _heads, _stream); - - if (_attn_prob_dropout.HasDropout()) { - if (_attn_dropout_checkpoint) - _attn_prob_dropout.Forward( - bsz_heads * _seq_length, ctx_bufB_ptr_recomp, soft_out_ptr, _stream, true); - - _attn_context.Backward(bsz_heads, - buf_2, - v_tf_ptr, - (_attn_dropout_checkpoint ? 
ctx_bufB_ptr_recomp : ctx_bufB_ptr), - _cublasHandle, - buf_3, - ff2_buf); - } else - _attn_context.Backward( - bsz_heads, buf_2, v_tf_ptr, soft_out_ptr, _cublasHandle, buf_3, ff2_buf); - - _attn_prob_dropout.Backward(bsz_heads * _seq_length, ff2_buf, _stream); - - _softmax.Backward(bsz, ff2_buf, soft_out_ptr, _stream); - - _attn_scores.Backward(bsz_heads, ff2_buf, k_tf_ptr, q_tf_ptr, _cublasHandle, buf_2, buf_1); - - launch_transform4d_0213(ff2_buf, buf_1, bsz, _heads, _seq_length, _hidden_size, _stream, 3); - - if (_pre_or_postLayerNorm) - _qkv_linear.Backward(bsz_seq, - ff2_buf, - inp_norm_ptr, - attn_qkvw_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - _cublasHandle, - _stream, - buf_2); - else - _qkv_linear.Backward(bsz_seq, - ff2_buf, - input_ptr, - attn_qkvw_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - _cublasHandle, - _stream, - buf_2); - - if (_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - input_ptr); - - else - _layer_norm.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - inp_norm_ptr); - } else - launch_fused_add2(grad_input_ptr, buf_2, buf_0, bsz, _seq_length, _hidden_size, _stream); -} - -template -void BertTransformerLayer::SetTrainingMode(bool training) -{ - // Dropout will be skipped when not in training model. 
- _attn_prob_dropout.SetTrainingMode(training); - _attn_output_dropout.SetTrainingMode(training); - _layer_output_dropout.SetTrainingMode(training); -} - -template -void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, - uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr, - T* attn_layer_norm_var, - T* attn_layer_norm_mean, - T* layer_norm_var, - T* layer_norm_mean) -{ - _attn_prob_dropout.SetMask(attn_prob_dropout_mask_ptr); - _attn_output_dropout.SetMask(attn_output_dropout_mask_ptr); - _layer_output_dropout.SetMask(layer_output_dropout_mask_ptr); - - _attn_layer_norm.SetVar(attn_layer_norm_var); - _attn_layer_norm.SetMean(attn_layer_norm_mean); - _layer_norm.SetVar(layer_norm_var); - _layer_norm.SetMean(layer_norm_mean); -} - -template -void BertTransformerLayer::SetSeqLength(unsigned seq_len) -{ - _seq_length = seq_len; - - _softmax.SetSeqLength(_seq_length); - _attn_prob_dropout.SetDimension(_seq_length); - _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); - _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); -} - -template -int create_transformer_layer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_dim, - unsigned num_heads, - unsigned intermediate_size, - float attn_dropout_ratio, - float hidden_dropout_ratio, - float layer_norm_eps, - int seed, - bool pre_or_postLayerNorm, - bool test_gemm, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode) -{ - Context::Instance().SetSeed(seed); - Context::Instance().TestGemmFP16( - test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads); - - auto layer = std::make_shared>(layer_id, - batch_size, - hidden_dim, - num_heads, - intermediate_size, - init_seq_length, - attn_dropout_ratio, - hidden_dropout_ratio, - layer_norm_eps, - pre_or_postLayerNorm, - Context::Instance().GetGemmAlgos(), - attn_dropout_checkpoint, - 
normalize_invertible, - gelu_checkpoint, - stochastic_mode); - - s_transformer_layers[layer_id] = layer; - - std::string dtype = (std::is_same::value) ? "half" : "float"; - - std::cout << "layer #" << layer_id << " is created with date type [" << dtype << "]." - << std::endl; - - return 0; -} - -template -std::vector ds_transformer_forward(unsigned layer_id, - const torch::Tensor& input, - const torch::Tensor& input_mask, - const torch::Tensor& attn_qkvw, - const torch::Tensor& attn_qkvb, - const torch::Tensor& attn_ow, - const torch::Tensor& attn_ob, - const torch::Tensor& attn_nw, - const torch::Tensor& attn_nb, - const torch::Tensor& inter_w, - const torch::Tensor& inter_b, - const torch::Tensor& output_w, - const torch::Tensor& output_b, - const torch::Tensor& norm_w, - const torch::Tensor& norm_b, - bool training_mode, - bool prelayernorm, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint) -{ - CHECK_INPUT(input); - CHECK_INPUT(input_mask); - CHECK_INPUT(attn_qkvw); - CHECK_INPUT(attn_qkvb); - CHECK_INPUT(attn_ow); - CHECK_INPUT(attn_ob); - CHECK_INPUT(attn_nw); - CHECK_INPUT(attn_nb); - CHECK_INPUT(inter_w); - CHECK_INPUT(inter_b); - CHECK_INPUT(output_w); - CHECK_INPUT(output_b); - CHECK_INPUT(norm_w); - CHECK_INPUT(norm_b); - - unsigned bsz = input.size(0); - - const T* input_ptr = (const T*)input.data_ptr(); - const T* input_mask_ptr = (const T*)input_mask.data_ptr(); - const T* attn_qkvw_ptr = (const T*)attn_qkvw.data_ptr(); - const T* attn_qkvb_ptr = (const T*)attn_qkvb.data_ptr(); - const T* attn_ow_ptr = (const T*)attn_ow.data_ptr(); - const T* attn_ob_ptr = (const T*)attn_ob.data_ptr(); - const T* attn_nw_ptr = (const T*)attn_nw.data_ptr(); - const T* attn_nb_ptr = (const T*)attn_nb.data_ptr(); - const T* inter_w_ptr = (const T*)inter_w.data_ptr(); - const T* inter_b_ptr = (const T*)inter_b.data_ptr(); - const T* output_w_ptr = (const T*)output_w.data_ptr(); - const T* output_b_ptr = (const T*)output_b.data_ptr(); - 
const T* norm_w_ptr = (const T*)norm_w.data_ptr(); - const T* norm_b_ptr = (const T*)norm_b.data_ptr(); - - auto output = torch::empty_like(input); - T* out_ptr = (T*)output.data_ptr(); - - auto options = torch::TensorOptions() - .dtype(input.options().dtype()) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(true); - - auto uint8_options = torch::TensorOptions() - .dtype(torch::kInt8) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(false); - - std::shared_ptr> layer = - std::static_pointer_cast>(s_transformer_layers[layer_id]); - - unsigned seq_len = layer->GetSeqLength(); - if (input.size(1) != seq_len) { - seq_len = input.size(1); - layer->SetSeqLength(seq_len); - } - - auto workspace = torch::empty({get_workspace_size(bsz, - seq_len, - layer->GetHiddenSize(), - layer->GetIntermediateSize(), - layer->GetNumHeads(), - layer->IsTrainingMode(), - layer->GeluCheckpoint())}, - options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); - - auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); - auto add_res = (normalize_invertible ? 
inp_norm : torch::empty_like(input)); - auto attn_o_inp = torch::empty_like(input); - auto qkv_tf = torch::empty({(bsz * seq_len), output_w.size(0) * 3}, options); - - auto attn_prob_dropout_mask = - torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, uint8_options); - auto attn_output_dropout_mask = - torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - auto layer_output_dropout_mask = - torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - - auto attn_layer_norm_var = torch::empty({(bsz * seq_len)}, options); - auto attn_layer_norm_mean = torch::empty({(bsz * seq_len)}, options); - auto layer_norm_var = torch::empty({(bsz * seq_len)}, options); - auto layer_norm_mean = torch::empty({(bsz * seq_len)}, options); - - T* inp_norm_ptr = (T*)inp_norm.data_ptr(); - T* add_res_ptr = (T*)add_res.data_ptr(); - T* q_tf_ptr = (T*)qkv_tf.data_ptr(); - T* k_tf_ptr = q_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)k_tf.data_ptr(); - T* v_tf_ptr = k_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)v_tf.data_ptr(); - T* attn_o_inp_ptr = (T*)attn_o_inp.data_ptr(); - - torch::Tensor ff2_inp = torch::empty({(bsz * seq_len), output_w.size(1)}, options); - torch::Tensor gelu_inp = - (gelu_checkpoint ? ff2_inp : torch::empty({(bsz * seq_len), output_w.size(1)}, options)); - auto ff1_inp = torch::empty_like(input); - T* ff2_inp_ptr = (T*)ff2_inp.data_ptr(); - T* gelu_inp_ptr = (T*)gelu_inp.data_ptr(); - T* ff1_inp_ptr = (T*)ff1_inp.data_ptr(); - - torch::Tensor soft_out = - torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options); - torch::Tensor ctx_bufB = - (attn_dropout_checkpoint - ? 
soft_out - : torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options)); - T* soft_out_ptr = (T*)soft_out.data_ptr(); - T* ctx_bufB_ptr = (T*)ctx_bufB.data_ptr(); - - layer->SetTrainingMode(training_mode); - layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), - (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)attn_layer_norm_var.data_ptr(), - (T*)attn_layer_norm_mean.data_ptr(), - (T*)layer_norm_var.data_ptr(), - (T*)layer_norm_mean.data_ptr()); - - layer->Forward(bsz, - input_ptr, - input_mask_ptr, - attn_qkvw_ptr, - attn_qkvb_ptr, - attn_ow_ptr, - attn_ob_ptr, - attn_nw_ptr, - attn_nb_ptr, - inter_w_ptr, - inter_b_ptr, - output_w_ptr, - output_b_ptr, - norm_w_ptr, - norm_b_ptr, - out_ptr, - inp_norm_ptr, - q_tf_ptr, - k_tf_ptr, - v_tf_ptr, - soft_out_ptr, - ctx_bufB_ptr, - attn_o_inp_ptr, - add_res_ptr, - ff1_inp_ptr, - gelu_inp_ptr, - ff2_inp_ptr); - - return {output, - inp_norm, - qkv_tf, - soft_out, - ctx_bufB, - attn_o_inp, - add_res, - ff1_inp, - gelu_inp, - ff2_inp, - attn_prob_dropout_mask, - attn_output_dropout_mask, - layer_output_dropout_mask, - attn_layer_norm_var, - attn_layer_norm_mean, - layer_norm_var, - layer_norm_mean}; -} - -template -std::vector ds_transformer_backward(unsigned layer_id, - const torch::Tensor& grad_output, - const torch::Tensor& output, - const torch::Tensor& inp_norm, - const torch::Tensor& qkv_tf, - const torch::Tensor& soft_out, - const torch::Tensor& ctx_bufB, - const torch::Tensor& attn_o_inp, - const torch::Tensor& add_res, - const torch::Tensor& ff1_inp, - const torch::Tensor& gelu_inp, - const torch::Tensor& ff2_inp, - const torch::Tensor& attn_prob_dropout_mask, - const torch::Tensor& attn_output_dropout_mask, - const torch::Tensor& layer_output_dropout_mask, - const torch::Tensor& attn_layer_norm_var, - const torch::Tensor& attn_layer_norm_mean, - const torch::Tensor& layer_norm_var, - const torch::Tensor& layer_norm_mean, - 
const torch::Tensor& input, - const torch::Tensor& input_mask, - const torch::Tensor& attn_qkvw, - const torch::Tensor& attn_qkvb, - const torch::Tensor& attn_ow, - const torch::Tensor& attn_ob, - const torch::Tensor& attn_nw, - const torch::Tensor& attn_nb, - const torch::Tensor& inter_w, - const torch::Tensor& inter_b, - const torch::Tensor& output_w, - const torch::Tensor& output_b, - const torch::Tensor& norm_w, - const torch::Tensor& norm_b) -{ - auto g_output = grad_output.contiguous(); - CHECK_INPUT(g_output); - CHECK_INPUT(output); - CHECK_INPUT(inp_norm); - CHECK_INPUT(qkv_tf); - CHECK_INPUT(add_res); - CHECK_INPUT(soft_out); - CHECK_INPUT(ctx_bufB); - CHECK_INPUT(attn_o_inp); - CHECK_INPUT(ff1_inp); - CHECK_INPUT(gelu_inp); - CHECK_INPUT(ff2_inp); - CHECK_INPUT(input); - CHECK_INPUT(input_mask); - CHECK_INPUT(attn_qkvw); - CHECK_INPUT(attn_qkvb); - CHECK_INPUT(attn_ow); - CHECK_INPUT(attn_ob); - CHECK_INPUT(attn_nw); - CHECK_INPUT(attn_nb); - CHECK_INPUT(inter_w); - CHECK_INPUT(inter_b); - CHECK_INPUT(output_w); - CHECK_INPUT(output_b); - CHECK_INPUT(norm_w); - CHECK_INPUT(norm_b); - - unsigned bsz = g_output.size(0); - - std::shared_ptr> layer = - std::static_pointer_cast>(s_transformer_layers[layer_id]); - - unsigned seq_len = layer->GetSeqLength(); - if (g_output.size(1) != seq_len) { - seq_len = g_output.size(1); - layer->SetSeqLength(seq_len); - } - auto options = torch::TensorOptions() - .dtype(g_output.options().dtype()) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(true); - auto workspace = torch::empty({get_workspace_size(bsz, - seq_len, - layer->GetHiddenSize(), - layer->GetIntermediateSize(), - layer->GetNumHeads(), - layer->IsTrainingMode(), - layer->GeluCheckpoint())}, - options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); - - auto grad_input = torch::empty_like(input); - auto grad_attn_qkvw = torch::empty_like(attn_qkvw); - auto grad_attn_qkvb = torch::empty_like(attn_qkvb); - auto grad_attn_ow = 
torch::empty_like(attn_ow); - auto grad_attn_ob = torch::empty_like(attn_ob); - auto grad_attn_nw = torch::empty_like(attn_nw); - auto grad_attn_nb = torch::empty_like(attn_nb); - auto grad_inter_w = torch::empty_like(inter_w); - auto grad_inter_b = torch::empty_like(inter_b); - auto grad_output_w = torch::empty_like(output_w); - auto grad_output_b = torch::empty_like(output_b); - auto grad_norm_w = torch::empty_like(norm_w); - auto grad_norm_b = torch::empty_like(norm_b); - - // inputs. - const T* grad_output_ptr = (const T*)g_output.data_ptr(); - const T* input_ptr = (const T*)input.data_ptr(); - const T* output_ptr = (const T*)output.data_ptr(); - const T* inp_norm_ptr = (const T*)inp_norm.data_ptr(); - const T* q_tf_ptr = (const T*)qkv_tf.data_ptr(); - const T* add_res_ptr = (const T*)add_res.data_ptr(); - const T* k_tf_ptr = - q_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(const T*)k_tf.data_ptr(); - const T* v_tf_ptr = - k_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(const T*)v_tf.data_ptr(); - const T* ff1_inp_ptr = (const T*)ff1_inp.data_ptr(); - const T* gelu_inp_ptr = (const T*)gelu_inp.data_ptr(); - const T* ff2_inp_ptr = (const T*)ff2_inp.data_ptr(); - const T* ctx_bufB_ptr = (const T*)ctx_bufB.data_ptr(); - const T* soft_out_ptr = (const T*)soft_out.data_ptr(); - const T* attn_o_inp_ptr = (const T*)attn_o_inp.data_ptr(); - const T* input_mask_ptr = (const T*)input_mask.data_ptr(); - const T* attn_qkvw_ptr = (const T*)attn_qkvw.data_ptr(); - const T* attn_ow_ptr = (const T*)attn_ow.data_ptr(); - const T* attn_nw_ptr = (const T*)attn_nw.data_ptr(); - const T* attn_nb_ptr = (const T*)attn_nb.data_ptr(); - const T* inter_w_ptr = (const T*)inter_w.data_ptr(); - const T* inter_b_ptr = (const T*)inter_b.data_ptr(); - const T* output_w_ptr = (const T*)output_w.data_ptr(); - const T* norm_w_ptr = (const T*)norm_w.data_ptr(); - const T* norm_b_ptr = (const T*)norm_b.data_ptr(); - - // outputs. 
- T* grad_input_ptr = (T*)grad_input.data_ptr(); - T* grad_attn_qkvw_ptr = (T*)grad_attn_qkvw.data_ptr(); - T* grad_attn_qkvb_ptr = (T*)grad_attn_qkvb.data_ptr(); - T* grad_attn_ow_ptr = (T*)grad_attn_ow.data_ptr(); - T* grad_attn_ob_ptr = (T*)grad_attn_ob.data_ptr(); - T* grad_attn_nw_ptr = (T*)grad_attn_nw.data_ptr(); - T* grad_attn_nb_ptr = (T*)grad_attn_nb.data_ptr(); - T* grad_inter_w_ptr = (T*)grad_inter_w.data_ptr(); - T* grad_inter_b_ptr = (T*)grad_inter_b.data_ptr(); - T* grad_output_w_ptr = (T*)grad_output_w.data_ptr(); - T* grad_output_b_ptr = (T*)grad_output_b.data_ptr(); - T* grad_norm_w_ptr = (T*)grad_norm_w.data_ptr(); - T* grad_norm_b_ptr = (T*)grad_norm_b.data_ptr(); - - layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), - (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)attn_layer_norm_var.data_ptr(), - (T*)attn_layer_norm_mean.data_ptr(), - (T*)layer_norm_var.data_ptr(), - (T*)layer_norm_mean.data_ptr()); - - layer->Backward(bsz, - grad_output_ptr, - input_ptr, - output_ptr, - inp_norm_ptr, - q_tf_ptr, - k_tf_ptr, - v_tf_ptr, - soft_out_ptr, - ctx_bufB_ptr, - attn_o_inp_ptr, - add_res_ptr, - ff1_inp_ptr, - gelu_inp_ptr, - ff2_inp_ptr, - input_mask_ptr, - attn_qkvw_ptr, - attn_ow_ptr, - attn_nw_ptr, - attn_nb_ptr, - inter_w_ptr, - inter_b_ptr, - output_w_ptr, - norm_w_ptr, - norm_b_ptr, - - grad_input_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - grad_attn_ow_ptr, - grad_attn_ob_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - grad_inter_w_ptr, - grad_inter_b_ptr, - grad_output_w_ptr, - grad_output_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr); - - return {grad_input, - grad_attn_qkvw, - grad_attn_qkvb, - grad_attn_ow, - grad_attn_ob, - grad_attn_nw, - grad_attn_nb, - grad_inter_w, - grad_inter_b, - grad_output_w, - grad_output_b, - grad_norm_w, - grad_norm_b}; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("forward_fp32", - &ds_transformer_forward, - 
"DeepSpeed Transformer forward with fp32 (CUDA)"); - m.def("forward_fp16", - &ds_transformer_forward<__half>, - "DeepSpeed Transformer forward with fp16 (CUDA)"); - m.def("backward_fp32", - &ds_transformer_backward, - "DeepSpeed Transformer backward with fp32 (CUDA)"); - m.def("backward_fp16", - &ds_transformer_backward<__half>, - "DeepSpeed Transformer backward with fp16 (CUDA)"); - m.def("create_transformer_layer_fp32", - &create_transformer_layer, - "Create DeepSpeed Transformer Transformer Layer with fp32 (CUDA)"); - m.def("create_transformer_layer_fp16", - &create_transformer_layer<__half>, - "Create DeepSpeed Transformer Transformer Layer with fp16 (CUDA)"); -} diff --git a/deepspeed/ops/csrc/transformer_bak/ds_transformer_hip.cpp b/deepspeed/ops/csrc/transformer_bak/ds_transformer_hip.cpp deleted file mode 100644 index 7b47686737500b9c47ebf66a651c21a6590fc8e0..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/ds_transformer_hip.cpp +++ /dev/null @@ -1,1052 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include - -#include -#include -#include -#include -#include -#include -#include "Timer_hip.h" -#include "context_hip.h" -#include "cublas_wrappers_hip.h" -#include "custom_hip_layers.h" -#include "ds_transformer_hip.h" - -static std::unordered_map> s_transformer_layers; - -const int init_seq_length = 128; - -// C++ interface - -template -unsigned get_workspace_size(unsigned maxBatchSize, - unsigned seq_len, - unsigned hidden_size, - unsigned intermediate_size, - unsigned heads, - bool training, - bool gelu_checkpoint) -{ - unsigned workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size); - if (training) { - workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size); - workSpacesize += ((std::max)((size_t(maxBatchSize) * seq_len * intermediate_size), - 2 * (size_t(maxBatchSize) * heads * seq_len * seq_len))); - if (gelu_checkpoint) - workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * intermediate_size); - } - return workSpacesize; // * sizeof(T); -} - -// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
-#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -template -BertTransformerLayer::BertTransformerLayer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_size, - unsigned num_heads, - unsigned intermediate_size, - unsigned seq_length, - float attn_prob_dropout_ratio, - float hidden_output_dropout_ratio, - float layer_norm_eps, - bool pre_or_postLayerNorm, - const std::vector>& gemm_algos, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode) - : _layer_id(layer_id), - _batch_size(batch_size), - _hidden_size(hidden_size), - _heads(num_heads), - _intermediate_size(intermediate_size), - _seq_length(seq_length), - _training(true), - _pre_or_postLayerNorm(pre_or_postLayerNorm), - _attn_dropout_checkpoint(attn_dropout_checkpoint), - _normalize_invertible(normalize_invertible), - _gelu_checkpoint(gelu_checkpoint), - _stochastic_mode(stochastic_mode), - _stream(Context::Instance().GetCurrentStream()), - _cublasHandle(Context::Instance().GetCublasHandle()), - _qkv_linear(typename FeedForward::Config(batch_size * seq_length, - 3 * hidden_size, - hidden_size, - gemm_algos[0])), - _attn_out_linear(typename FeedForward::Config(batch_size * seq_length, - hidden_size, - hidden_size, - gemm_algos[0])), - _attn_layer_norm(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - layer_norm_eps, - true, - !normalize_invertible)), - _layer_norm(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - layer_norm_eps, - true, - !normalize_invertible)), - _ff1(typename FeedForward::Config(batch_size * seq_length, - _intermediate_size, - hidden_size, - gemm_algos[1])), - _ff2(typename FeedForward::Config(batch_size * seq_length, - hidden_size, - _intermediate_size, - gemm_algos[2])), - _softmax(typename 
Softmax::Config(batch_size, num_heads, seq_length)), - _gelu(typename Gelu::Config(_intermediate_size)), - _attn_prob_dropout(typename Dropout::Config(attn_prob_dropout_ratio, _seq_length)), - _attn_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), - _layer_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), - _attn_scores(typename StridedBatchGemm::Config(_batch_size * _heads, - _seq_length, - _seq_length, - _hidden_size / _heads, - //aiss debug 0506 - //(T(1.0) / T(sqrt(_hidden_size / _heads))), - (T(1.0 / (sqrt(_hidden_size / _heads)))), - T(0.0), - rocblas_operation_transpose, - rocblas_operation_none, - gemm_algos[3])), - _attn_context(typename StridedBatchGemm::Config(_batch_size * _heads, - _hidden_size / _heads, - _seq_length, - _seq_length, - T(1.0), - T(0.0), - rocblas_operation_none, - rocblas_operation_none, - gemm_algos[4])) -{ - assert(_hidden_size % _heads == 0); - - Initialize(); -} - -template -BertTransformerLayer::~BertTransformerLayer() -{ -} - -template -void BertTransformerLayer::Initialize() -{ -#ifndef __HIP_PLATFORM_HCC__ - if (std::is_same::value) rocblas_set_math_mode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); -#endif -} - -template -void BertTransformerLayer::Forward(unsigned bsz, - const T* input_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_qkvb_ptr, - const T* attn_ow_ptr, - const T* attn_ob_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* output_b_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - T* out_ptr, - T* inp_norm_ptr, - T* q_tf_ptr, - T* k_tf_ptr, - T* v_tf_ptr, - T* soft_out_ptr, - T* ctx_bufB_ptr, - T* attn_o_inp_ptr, - T* add_res_ptr, - T* ff1_inp_ptr, - T* gelu_inp_ptr, - T* ff2_inp_ptr) -{ - rocblas_set_stream(_cublasHandle, _stream); - - if (!_stochastic_mode) hipStreamSynchronize(_stream); - - T* workspace = 
static_cast(Context::Instance().GetWorkSpace()); - size_t small_buf_size = bsz * _seq_length * _hidden_size; - T* buf_0 = workspace; - T* buf_1 = buf_0 + small_buf_size; - T* buf_2 = buf_1; - - if (_normalize_invertible) { - add_res_ptr = buf_1 + 3 * small_buf_size; - buf_2 = add_res_ptr; - } - if (_gelu_checkpoint) buf_2 += small_buf_size; - if (_attn_dropout_checkpoint) - ctx_bufB_ptr = - (_gelu_checkpoint ? (buf_2 + (_intermediate_size / _hidden_size) * small_buf_size) - : (buf_1 + 4 * small_buf_size)); - - int bsz_seq = bsz * _seq_length; - - if (_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.ForwardCheckpoint( - bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - - else - _layer_norm.Forward( - bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - } - - if (_pre_or_postLayerNorm) - _qkv_linear.Forward(bsz_seq, inp_norm_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); - else - _qkv_linear.Forward(bsz_seq, input_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); - - launch_bias_add_transform_0213( - q_tf_ptr, buf_0, attn_qkvb_ptr, bsz, _seq_length, _hidden_size, _heads, _stream, 3); - - int bsz_heads = bsz * _heads; - - // attention scores - _attn_scores.Forward(bsz_heads, soft_out_ptr, k_tf_ptr, q_tf_ptr, _cublasHandle); - - // Softmax + Mask - _softmax.Forward(bsz, soft_out_ptr, input_mask_ptr, _stream); - - // attn prob dropout. - _attn_prob_dropout.Forward(bsz_heads * _seq_length, ctx_bufB_ptr, soft_out_ptr, _stream); - - // attention context - _attn_context.Forward(bsz_heads, buf_1, v_tf_ptr, ctx_bufB_ptr, _cublasHandle); - - launch_transform4d_0213( - attn_o_inp_ptr, buf_1, bsz, _heads, _seq_length, _hidden_size, _stream, 1); - - if (_pre_or_postLayerNorm) - _attn_out_linear.Forward(bsz_seq, attn_o_inp_ptr, attn_ow_ptr, buf_1, _cublasHandle); - else - _attn_out_linear.Forward(bsz_seq, attn_o_inp_ptr, attn_ow_ptr, ff1_inp_ptr, _cublasHandle); - - // attn output dropout. 
- if (_pre_or_postLayerNorm) - _attn_output_dropout.ForwardWithBias( - bsz_seq, add_res_ptr, buf_1, input_ptr, attn_ob_ptr, _stream); - else - _attn_output_dropout.ForwardWithBias( - bsz_seq, add_res_ptr, ff1_inp_ptr, input_ptr, attn_ob_ptr, _stream); - - if (_pre_or_postLayerNorm) { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.ForwardCheckpoint( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - else - _attn_layer_norm.Forward( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - } else { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.ForwardCheckpoint( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - else - _attn_layer_norm.Forward( - bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); - } - - _ff1.Forward(bsz_seq, - ff1_inp_ptr, - inter_w_ptr, - (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), - _cublasHandle); - - _gelu.ForwardWithBiasAdd(bsz_seq, - (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), - inter_b_ptr, - (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), - _stream); - - _ff2.Forward( - bsz_seq, (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), output_w_ptr, out_ptr, _cublasHandle); - - // layer output dropout. 
- if (_pre_or_postLayerNorm) - _layer_output_dropout.ForwardWithBias( - bsz_seq, out_ptr, out_ptr, add_res_ptr, output_b_ptr, _stream); - else - _layer_output_dropout.ForwardWithBias( - bsz_seq, inp_norm_ptr, out_ptr, ff1_inp_ptr, output_b_ptr, _stream); - - if (!_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.ForwardCheckpoint( - bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - else - _layer_norm.Forward( - bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); - } -} - -template -void BertTransformerLayer::Backward(unsigned bsz, - const T* grad_output_ptr, - const T* input_ptr, - const T* output_ptr, - const T* inp_norm_ptr, - const T* q_tf_ptr, - const T* k_tf_ptr, - const T* v_tf_ptr, - const T* soft_out_ptr, - const T* ctx_bufB_ptr, - const T* attn_o_inp_ptr, - const T* add_res_ptr, - const T* ff1_inp_ptr, - const T* gelu_inp_ptr, - const T* ff2_inp_ptr, - const T* input_mask_ptr, - const T* attn_qkvw_ptr, - const T* attn_ow_ptr, - const T* attn_nw_ptr, - const T* attn_nb_ptr, - const T* inter_w_ptr, - const T* inter_b_ptr, - const T* output_w_ptr, - const T* norm_w_ptr, - const T* norm_b_ptr, - - T* grad_input_ptr, - T* grad_attn_qkvw_ptr, - T* grad_attn_qkvb_ptr, - T* grad_attn_ow_ptr, - T* grad_attn_ob_ptr, - T* grad_attn_nw_ptr, - T* grad_attn_nb_ptr, - T* grad_inter_w_ptr, - T* grad_inter_b_ptr, - T* grad_output_w_ptr, - T* grad_output_b_ptr, - T* grad_norm_w_ptr, - T* grad_norm_b_ptr) -{ - rocblas_set_stream(_cublasHandle, _stream); - - if (!_stochastic_mode) hipStreamSynchronize(_stream); - - T* workspace = static_cast(Context::Instance().GetWorkSpace()); - size_t small_buf_size = bsz * _seq_length * _hidden_size; - T* buf_0 = workspace; - T* buf_1 = buf_0 + small_buf_size; - T* buf_2 = buf_1 + small_buf_size; - T* buf_3 = buf_2 + small_buf_size; - - T* ff2_buf = (_gelu_checkpoint ? 
buf_3 + (bsz * _seq_length * _intermediate_size) - : buf_3 + small_buf_size); - T* ctx_bufB_ptr_recomp = ff2_buf + (_seq_length * _seq_length * bsz * _heads); - - hipStream_t streams[2] = {_stream, _stream}; - - int bsz_seq = bsz * _seq_length; - int bsz_heads = bsz * _heads; - - if (!_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - inp_norm_ptr); - - else - _layer_norm.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - output_ptr); - } - - if (_pre_or_postLayerNorm) - _layer_output_dropout.Backward(bsz_seq, buf_0, grad_output_ptr, _stream); - else - _layer_output_dropout.Backward(bsz_seq, buf_0, buf_1, _stream); - - const T* layer_dropout_buf = _layer_output_dropout.HasDropout() - ? buf_0 - : (_pre_or_postLayerNorm ? grad_output_ptr : buf_1); - - if (_gelu_checkpoint) - _gelu.ForwardWithBiasAdd(bsz_seq, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); - _ff2.Backward(bsz_seq, - layer_dropout_buf, - (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), - output_w_ptr, - grad_output_w_ptr, - grad_output_b_ptr, - _cublasHandle, - _stream, - ff2_buf); - - _gelu.Backward( - bsz_seq, ff2_buf, (_gelu_checkpoint ? 
ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, _stream); - - _ff1.Backward(bsz_seq, - ff2_buf, - ff1_inp_ptr, - inter_w_ptr, - grad_inter_w_ptr, - grad_inter_b_ptr, - _cublasHandle, - _stream, - buf_3); - - if (!_pre_or_postLayerNorm) - launch_fused_add2(buf_2, buf_3, buf_1, bsz, _seq_length, _hidden_size, _stream); - - if (_pre_or_postLayerNorm) { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); - - else - _attn_layer_norm.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); - } else { - if (_attn_layer_norm.UseMean()) - _attn_layer_norm.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); - - else - _attn_layer_norm.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); - } - - _attn_output_dropout.Backward(bsz_seq, buf_2, buf_0, _stream); - - T* attn_output_dropout_buf = _attn_output_dropout.HasDropout() ? buf_2 : buf_0; - - _attn_out_linear.Backward(bsz_seq, - attn_output_dropout_buf, - attn_o_inp_ptr, - attn_ow_ptr, - grad_attn_ow_ptr, - grad_attn_ob_ptr, - _cublasHandle, - _stream, - buf_1); - - launch_transform_0213(buf_2, buf_1, bsz, _seq_length, _hidden_size, _heads, _stream); - - if (_attn_prob_dropout.HasDropout()) { - if (_attn_dropout_checkpoint) - _attn_prob_dropout.Forward( - bsz_heads * _seq_length, ctx_bufB_ptr_recomp, soft_out_ptr, _stream, true); - - _attn_context.Backward(bsz_heads, - buf_2, - v_tf_ptr, - (_attn_dropout_checkpoint ? 
ctx_bufB_ptr_recomp : ctx_bufB_ptr), - _cublasHandle, - buf_3, - ff2_buf); - } else - _attn_context.Backward( - bsz_heads, buf_2, v_tf_ptr, soft_out_ptr, _cublasHandle, buf_3, ff2_buf); - - _attn_prob_dropout.Backward(bsz_heads * _seq_length, ff2_buf, _stream); - - _softmax.Backward(bsz, ff2_buf, soft_out_ptr, _stream); - - _attn_scores.Backward(bsz_heads, ff2_buf, k_tf_ptr, q_tf_ptr, _cublasHandle, buf_2, buf_1); - - launch_transform4d_0213(ff2_buf, buf_1, bsz, _heads, _seq_length, _hidden_size, _stream, 3); - - if (_pre_or_postLayerNorm) - _qkv_linear.Backward(bsz_seq, - ff2_buf, - inp_norm_ptr, - attn_qkvw_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - _cublasHandle, - _stream, - buf_2); - else - _qkv_linear.Backward(bsz_seq, - ff2_buf, - input_ptr, - attn_qkvw_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - _cublasHandle, - _stream, - buf_2); - - if (_pre_or_postLayerNorm) { - if (_layer_norm.UseMean()) - _layer_norm.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - input_ptr); - - else - _layer_norm.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - inp_norm_ptr); - } else - launch_fused_add2(grad_input_ptr, buf_2, buf_0, bsz, _seq_length, _hidden_size, _stream); -} - -template -void BertTransformerLayer::SetTrainingMode(bool training) -{ - // Dropout will be skipped when not in training model. 
- _attn_prob_dropout.SetTrainingMode(training); - _attn_output_dropout.SetTrainingMode(training); - _layer_output_dropout.SetTrainingMode(training); -} - -template -void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, - uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr, - T* attn_layer_norm_var, - T* attn_layer_norm_mean, - T* layer_norm_var, - T* layer_norm_mean) -{ - _attn_prob_dropout.SetMask(attn_prob_dropout_mask_ptr); - _attn_output_dropout.SetMask(attn_output_dropout_mask_ptr); - _layer_output_dropout.SetMask(layer_output_dropout_mask_ptr); - - _attn_layer_norm.SetVar(attn_layer_norm_var); - _attn_layer_norm.SetMean(attn_layer_norm_mean); - _layer_norm.SetVar(layer_norm_var); - _layer_norm.SetMean(layer_norm_mean); -} - -template -void BertTransformerLayer::SetSeqLength(unsigned seq_len) -{ - _seq_length = seq_len; - - _softmax.SetSeqLength(_seq_length); - _attn_prob_dropout.SetDimension(_seq_length); - _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); - _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); -} - -template -int create_transformer_layer(unsigned layer_id, - unsigned batch_size, - unsigned hidden_dim, - unsigned num_heads, - unsigned intermediate_size, - float attn_dropout_ratio, - float hidden_dropout_ratio, - float layer_norm_eps, - int seed, - bool pre_or_postLayerNorm, - bool test_gemm, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint, - bool stochastic_mode) -{ - Context::Instance().SetSeed(seed); - Context::Instance().TestGemmFP16( - test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads); - - auto layer = std::make_shared>(layer_id, - batch_size, - hidden_dim, - num_heads, - intermediate_size, - init_seq_length, - attn_dropout_ratio, - hidden_dropout_ratio, - layer_norm_eps, - pre_or_postLayerNorm, - Context::Instance().GetGemmAlgos(), - attn_dropout_checkpoint, - 
normalize_invertible, - gelu_checkpoint, - stochastic_mode); - - s_transformer_layers[layer_id] = layer; - - std::string dtype = (std::is_same::value) ? "half" : "float"; - - std::cout << "layer #" << layer_id << " is created with date type [" << dtype << "]." - << std::endl; - - return 0; -} - -template -std::vector ds_transformer_forward(unsigned layer_id, - const torch::Tensor& input, - const torch::Tensor& input_mask, - const torch::Tensor& attn_qkvw, - const torch::Tensor& attn_qkvb, - const torch::Tensor& attn_ow, - const torch::Tensor& attn_ob, - const torch::Tensor& attn_nw, - const torch::Tensor& attn_nb, - const torch::Tensor& inter_w, - const torch::Tensor& inter_b, - const torch::Tensor& output_w, - const torch::Tensor& output_b, - const torch::Tensor& norm_w, - const torch::Tensor& norm_b, - bool training_mode, - bool prelayernorm, - bool attn_dropout_checkpoint, - bool normalize_invertible, - bool gelu_checkpoint) -{ - CHECK_INPUT(input); - CHECK_INPUT(input_mask); - CHECK_INPUT(attn_qkvw); - CHECK_INPUT(attn_qkvb); - CHECK_INPUT(attn_ow); - CHECK_INPUT(attn_ob); - CHECK_INPUT(attn_nw); - CHECK_INPUT(attn_nb); - CHECK_INPUT(inter_w); - CHECK_INPUT(inter_b); - CHECK_INPUT(output_w); - CHECK_INPUT(output_b); - CHECK_INPUT(norm_w); - CHECK_INPUT(norm_b); - - unsigned bsz = input.size(0); - - const T* input_ptr = (const T*)input.data_ptr(); - const T* input_mask_ptr = (const T*)input_mask.data_ptr(); - const T* attn_qkvw_ptr = (const T*)attn_qkvw.data_ptr(); - const T* attn_qkvb_ptr = (const T*)attn_qkvb.data_ptr(); - const T* attn_ow_ptr = (const T*)attn_ow.data_ptr(); - const T* attn_ob_ptr = (const T*)attn_ob.data_ptr(); - const T* attn_nw_ptr = (const T*)attn_nw.data_ptr(); - const T* attn_nb_ptr = (const T*)attn_nb.data_ptr(); - const T* inter_w_ptr = (const T*)inter_w.data_ptr(); - const T* inter_b_ptr = (const T*)inter_b.data_ptr(); - const T* output_w_ptr = (const T*)output_w.data_ptr(); - const T* output_b_ptr = (const T*)output_b.data_ptr(); - 
const T* norm_w_ptr = (const T*)norm_w.data_ptr(); - const T* norm_b_ptr = (const T*)norm_b.data_ptr(); - - auto output = torch::empty_like(input); - T* out_ptr = (T*)output.data_ptr(); - - auto options = torch::TensorOptions() - .dtype(input.options().dtype()) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(true); - - auto uint8_options = torch::TensorOptions() - .dtype(torch::kInt8) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(false); - - std::shared_ptr> layer = - std::static_pointer_cast>(s_transformer_layers[layer_id]); - - unsigned seq_len = layer->GetSeqLength(); - if (input.size(1) != seq_len) { - seq_len = input.size(1); - layer->SetSeqLength(seq_len); - } - - auto workspace = torch::empty({get_workspace_size(bsz, - seq_len, - layer->GetHiddenSize(), - layer->GetIntermediateSize(), - layer->GetNumHeads(), - layer->IsTrainingMode(), - layer->GeluCheckpoint())}, - options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); - - auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); - auto add_res = (normalize_invertible ? 
inp_norm : torch::empty_like(input)); - auto attn_o_inp = torch::empty_like(input); - auto qkv_tf = torch::empty({(bsz * seq_len), output_w.size(0) * 3}, options); - - auto attn_prob_dropout_mask = - torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, uint8_options); - auto attn_output_dropout_mask = - torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - auto layer_output_dropout_mask = - torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - - auto attn_layer_norm_var = torch::empty({(bsz * seq_len)}, options); - auto attn_layer_norm_mean = torch::empty({(bsz * seq_len)}, options); - auto layer_norm_var = torch::empty({(bsz * seq_len)}, options); - auto layer_norm_mean = torch::empty({(bsz * seq_len)}, options); - - T* inp_norm_ptr = (T*)inp_norm.data_ptr(); - T* add_res_ptr = (T*)add_res.data_ptr(); - T* q_tf_ptr = (T*)qkv_tf.data_ptr(); - T* k_tf_ptr = q_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)k_tf.data_ptr(); - T* v_tf_ptr = k_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)v_tf.data_ptr(); - T* attn_o_inp_ptr = (T*)attn_o_inp.data_ptr(); - - torch::Tensor ff2_inp = torch::empty({(bsz * seq_len), output_w.size(1)}, options); - torch::Tensor gelu_inp = - (gelu_checkpoint ? ff2_inp : torch::empty({(bsz * seq_len), output_w.size(1)}, options)); - auto ff1_inp = torch::empty_like(input); - T* ff2_inp_ptr = (T*)ff2_inp.data_ptr(); - T* gelu_inp_ptr = (T*)gelu_inp.data_ptr(); - T* ff1_inp_ptr = (T*)ff1_inp.data_ptr(); - - torch::Tensor soft_out = - torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options); - torch::Tensor ctx_bufB = - (attn_dropout_checkpoint - ? 
soft_out - : torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options)); - T* soft_out_ptr = (T*)soft_out.data_ptr(); - T* ctx_bufB_ptr = (T*)ctx_bufB.data_ptr(); - - layer->SetTrainingMode(training_mode); - layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), - (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)attn_layer_norm_var.data_ptr(), - (T*)attn_layer_norm_mean.data_ptr(), - (T*)layer_norm_var.data_ptr(), - (T*)layer_norm_mean.data_ptr()); - - layer->Forward(bsz, - input_ptr, - input_mask_ptr, - attn_qkvw_ptr, - attn_qkvb_ptr, - attn_ow_ptr, - attn_ob_ptr, - attn_nw_ptr, - attn_nb_ptr, - inter_w_ptr, - inter_b_ptr, - output_w_ptr, - output_b_ptr, - norm_w_ptr, - norm_b_ptr, - out_ptr, - inp_norm_ptr, - q_tf_ptr, - k_tf_ptr, - v_tf_ptr, - soft_out_ptr, - ctx_bufB_ptr, - attn_o_inp_ptr, - add_res_ptr, - ff1_inp_ptr, - gelu_inp_ptr, - ff2_inp_ptr); - - return {output, - inp_norm, - qkv_tf, - soft_out, - ctx_bufB, - attn_o_inp, - add_res, - ff1_inp, - gelu_inp, - ff2_inp, - attn_prob_dropout_mask, - attn_output_dropout_mask, - layer_output_dropout_mask, - attn_layer_norm_var, - attn_layer_norm_mean, - layer_norm_var, - layer_norm_mean}; -} - -template -std::vector ds_transformer_backward(unsigned layer_id, - const torch::Tensor& grad_output, - const torch::Tensor& output, - const torch::Tensor& inp_norm, - const torch::Tensor& qkv_tf, - const torch::Tensor& soft_out, - const torch::Tensor& ctx_bufB, - const torch::Tensor& attn_o_inp, - const torch::Tensor& add_res, - const torch::Tensor& ff1_inp, - const torch::Tensor& gelu_inp, - const torch::Tensor& ff2_inp, - const torch::Tensor& attn_prob_dropout_mask, - const torch::Tensor& attn_output_dropout_mask, - const torch::Tensor& layer_output_dropout_mask, - const torch::Tensor& attn_layer_norm_var, - const torch::Tensor& attn_layer_norm_mean, - const torch::Tensor& layer_norm_var, - const torch::Tensor& layer_norm_mean, - 
const torch::Tensor& input, - const torch::Tensor& input_mask, - const torch::Tensor& attn_qkvw, - const torch::Tensor& attn_qkvb, - const torch::Tensor& attn_ow, - const torch::Tensor& attn_ob, - const torch::Tensor& attn_nw, - const torch::Tensor& attn_nb, - const torch::Tensor& inter_w, - const torch::Tensor& inter_b, - const torch::Tensor& output_w, - const torch::Tensor& output_b, - const torch::Tensor& norm_w, - const torch::Tensor& norm_b) -{ - auto g_output = grad_output.contiguous(); - CHECK_INPUT(g_output); - CHECK_INPUT(output); - CHECK_INPUT(inp_norm); - CHECK_INPUT(qkv_tf); - CHECK_INPUT(add_res); - CHECK_INPUT(soft_out); - CHECK_INPUT(ctx_bufB); - CHECK_INPUT(attn_o_inp); - CHECK_INPUT(ff1_inp); - CHECK_INPUT(gelu_inp); - CHECK_INPUT(ff2_inp); - CHECK_INPUT(input); - CHECK_INPUT(input_mask); - CHECK_INPUT(attn_qkvw); - CHECK_INPUT(attn_qkvb); - CHECK_INPUT(attn_ow); - CHECK_INPUT(attn_ob); - CHECK_INPUT(attn_nw); - CHECK_INPUT(attn_nb); - CHECK_INPUT(inter_w); - CHECK_INPUT(inter_b); - CHECK_INPUT(output_w); - CHECK_INPUT(output_b); - CHECK_INPUT(norm_w); - CHECK_INPUT(norm_b); - - unsigned bsz = g_output.size(0); - - std::shared_ptr> layer = - std::static_pointer_cast>(s_transformer_layers[layer_id]); - - unsigned seq_len = layer->GetSeqLength(); - if (g_output.size(1) != seq_len) { - seq_len = g_output.size(1); - layer->SetSeqLength(seq_len); - } - auto options = torch::TensorOptions() - .dtype(g_output.options().dtype()) - .layout(torch::kStrided) - .device(torch::kCUDA) - .requires_grad(true); - auto workspace = torch::empty({get_workspace_size(bsz, - seq_len, - layer->GetHiddenSize(), - layer->GetIntermediateSize(), - layer->GetNumHeads(), - layer->IsTrainingMode(), - layer->GeluCheckpoint())}, - options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); - - auto grad_input = torch::empty_like(input); - auto grad_attn_qkvw = torch::empty_like(attn_qkvw); - auto grad_attn_qkvb = torch::empty_like(attn_qkvb); - auto grad_attn_ow = 
torch::empty_like(attn_ow); - auto grad_attn_ob = torch::empty_like(attn_ob); - auto grad_attn_nw = torch::empty_like(attn_nw); - auto grad_attn_nb = torch::empty_like(attn_nb); - auto grad_inter_w = torch::empty_like(inter_w); - auto grad_inter_b = torch::empty_like(inter_b); - auto grad_output_w = torch::empty_like(output_w); - auto grad_output_b = torch::empty_like(output_b); - auto grad_norm_w = torch::empty_like(norm_w); - auto grad_norm_b = torch::empty_like(norm_b); - - // inputs. - const T* grad_output_ptr = (const T*)g_output.data_ptr(); - const T* input_ptr = (const T*)input.data_ptr(); - const T* output_ptr = (const T*)output.data_ptr(); - const T* inp_norm_ptr = (const T*)inp_norm.data_ptr(); - const T* q_tf_ptr = (const T*)qkv_tf.data_ptr(); - const T* add_res_ptr = (const T*)add_res.data_ptr(); - const T* k_tf_ptr = - q_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(const T*)k_tf.data_ptr(); - const T* v_tf_ptr = - k_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(const T*)v_tf.data_ptr(); - const T* ff1_inp_ptr = (const T*)ff1_inp.data_ptr(); - const T* gelu_inp_ptr = (const T*)gelu_inp.data_ptr(); - const T* ff2_inp_ptr = (const T*)ff2_inp.data_ptr(); - const T* ctx_bufB_ptr = (const T*)ctx_bufB.data_ptr(); - const T* soft_out_ptr = (const T*)soft_out.data_ptr(); - const T* attn_o_inp_ptr = (const T*)attn_o_inp.data_ptr(); - const T* input_mask_ptr = (const T*)input_mask.data_ptr(); - const T* attn_qkvw_ptr = (const T*)attn_qkvw.data_ptr(); - const T* attn_ow_ptr = (const T*)attn_ow.data_ptr(); - const T* attn_nw_ptr = (const T*)attn_nw.data_ptr(); - const T* attn_nb_ptr = (const T*)attn_nb.data_ptr(); - const T* inter_w_ptr = (const T*)inter_w.data_ptr(); - const T* inter_b_ptr = (const T*)inter_b.data_ptr(); - const T* output_w_ptr = (const T*)output_w.data_ptr(); - const T* norm_w_ptr = (const T*)norm_w.data_ptr(); - const T* norm_b_ptr = (const T*)norm_b.data_ptr(); - - // outputs. 
- T* grad_input_ptr = (T*)grad_input.data_ptr(); - T* grad_attn_qkvw_ptr = (T*)grad_attn_qkvw.data_ptr(); - T* grad_attn_qkvb_ptr = (T*)grad_attn_qkvb.data_ptr(); - T* grad_attn_ow_ptr = (T*)grad_attn_ow.data_ptr(); - T* grad_attn_ob_ptr = (T*)grad_attn_ob.data_ptr(); - T* grad_attn_nw_ptr = (T*)grad_attn_nw.data_ptr(); - T* grad_attn_nb_ptr = (T*)grad_attn_nb.data_ptr(); - T* grad_inter_w_ptr = (T*)grad_inter_w.data_ptr(); - T* grad_inter_b_ptr = (T*)grad_inter_b.data_ptr(); - T* grad_output_w_ptr = (T*)grad_output_w.data_ptr(); - T* grad_output_b_ptr = (T*)grad_output_b.data_ptr(); - T* grad_norm_w_ptr = (T*)grad_norm_w.data_ptr(); - T* grad_norm_b_ptr = (T*)grad_norm_b.data_ptr(); - - layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), - (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)attn_layer_norm_var.data_ptr(), - (T*)attn_layer_norm_mean.data_ptr(), - (T*)layer_norm_var.data_ptr(), - (T*)layer_norm_mean.data_ptr()); - - layer->Backward(bsz, - grad_output_ptr, - input_ptr, - output_ptr, - inp_norm_ptr, - q_tf_ptr, - k_tf_ptr, - v_tf_ptr, - soft_out_ptr, - ctx_bufB_ptr, - attn_o_inp_ptr, - add_res_ptr, - ff1_inp_ptr, - gelu_inp_ptr, - ff2_inp_ptr, - input_mask_ptr, - attn_qkvw_ptr, - attn_ow_ptr, - attn_nw_ptr, - attn_nb_ptr, - inter_w_ptr, - inter_b_ptr, - output_w_ptr, - norm_w_ptr, - norm_b_ptr, - - grad_input_ptr, - grad_attn_qkvw_ptr, - grad_attn_qkvb_ptr, - grad_attn_ow_ptr, - grad_attn_ob_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - grad_inter_w_ptr, - grad_inter_b_ptr, - grad_output_w_ptr, - grad_output_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr); - - return {grad_input, - grad_attn_qkvw, - grad_attn_qkvb, - grad_attn_ow, - grad_attn_ob, - grad_attn_nw, - grad_attn_nb, - grad_inter_w, - grad_inter_b, - grad_output_w, - grad_output_b, - grad_norm_w, - grad_norm_b}; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("forward_fp32", - &ds_transformer_forward, - 
"DeepSpeed Transformer forward with fp32 (CUDA)"); - m.def("forward_fp16", - &ds_transformer_forward<__half>, - "DeepSpeed Transformer forward with fp16 (CUDA)"); - m.def("backward_fp32", - &ds_transformer_backward, - "DeepSpeed Transformer backward with fp32 (CUDA)"); - m.def("backward_fp16", - &ds_transformer_backward<__half>, - "DeepSpeed Transformer backward with fp16 (CUDA)"); - m.def("create_transformer_layer_fp32", - &create_transformer_layer, - "Create DeepSpeed Transformer Transformer Layer with fp32 (CUDA)"); - m.def("create_transformer_layer_fp16", - &create_transformer_layer<__half>, - "Create DeepSpeed Transformer Transformer Layer with fp16 (CUDA)"); -} diff --git a/deepspeed/ops/csrc/transformer_bak/gelu_kernels.cu b/deepspeed/ops/csrc/transformer_bak/gelu_kernels.cu deleted file mode 100644 index d683cf0af83daf829e390d83182eb6f0ffd040a6..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/gelu_kernels.cu +++ /dev/null @@ -1,330 +0,0 @@ -#include "custom_cuda_layers.h" - -inline __device__ float gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); -} - -inline __device__ float d_gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - - float x2mul = x * x * mul_param; - float tan_h = tanhf(sqrt_param * (x + x * x2mul)); - float dg1 = 0.5f * (1.0f + tan_h); - float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h); - float dg3 = dg2 * 3 * x2mul; - return (dg1 + dg2 + dg3); -} - -/* -Fused bias add with GELU - -Loads a vector of 4 elements each iteration, for stride -iterations. It was written with the intention to launch 256 thread -threadblocks, so to launch for bert-large, we would set ITERATIONS -to 4. This is currently done automatically as a heuristic, setting -the number of iterations as blocks of 1024. 
- -For FP16, the values are loaded from memory as __half, but converted -to FP32 for the arithmetic itself, to prevent numerous overflow on -the intermediate hyperbolic tangent, since there's no intrinsic -that computes it directly. -*/ - -__global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float4* input_cast = reinterpret_cast(input); - float4* vals_cast = reinterpret_cast(vals); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 data = input_cast[row * row_stride + i * loop_stride + id]; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - vals_cast[row * row_stride + i * loop_stride + id] = data; - } - } -} - -__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float2* input_cast = reinterpret_cast(input); - float2* vals_cast = reinterpret_cast(vals); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - vals_cast[row * row_stride + i * loop_stride + id] = vals_vec; - } - } -#endif -} - -__global__ void fused_bias_gelu(const float* input, - const float* bias, - float* vals, - int row_stride, - int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int 
loop_stride = blockDim.x; - - const float4* input_cast = reinterpret_cast(input); - float4* vals_cast = reinterpret_cast(vals); - const float4* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 data = input_cast[row * row_stride + i * loop_stride + id]; - float4 bias_data = bias_cast[i * loop_stride + id]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - vals_cast[row * row_stride + i * loop_stride + id] = data; - } - } -} - -__global__ void fused_bias_gelu(const __half* input, - const __half* bias, - __half* vals, - int row_stride, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float2* input_cast = reinterpret_cast(input); - float2* vals_cast = reinterpret_cast(vals); - const float2* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id]; - float2 bias_vec = bias_cast[i * loop_stride + id]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - vals_cast[row * row_stride + i * 
loop_stride + id] = vals_vec; - } - } -#endif -} - -__global__ void d_gelu_func(float* d_output, - const float* gelu_input, - const float* bias, - int row_stride, - int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - float4* d_output_cast = reinterpret_cast(d_output); - const float4* gelu_input_cast = reinterpret_cast(gelu_input); - const float4* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 output_data = d_output_cast[row * row_stride + i * loop_stride + id]; - float4 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id]; - float4 bias_data = bias_cast[i * loop_stride + id]; - - gelu_input_data.x += bias_data.x; - gelu_input_data.y += bias_data.y; - gelu_input_data.z += bias_data.z; - gelu_input_data.w += bias_data.w; - - output_data.x *= d_gelu(gelu_input_data.x); - output_data.y *= d_gelu(gelu_input_data.y); - output_data.z *= d_gelu(gelu_input_data.z); - output_data.w *= d_gelu(gelu_input_data.w); - - d_output_cast[row * row_stride + i * loop_stride + id] = output_data; - } - } -} - -__global__ void d_gelu_func(__half* d_output, - const __half* gelu_input, - const __half* bias, - int row_stride, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - float2* d_output_cast = reinterpret_cast(d_output); - const float2* gelu_input_cast = reinterpret_cast(gelu_input); - const float2* bias_cast = reinterpret_cast(bias); - -#pragma unroll - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 output_data = d_output_cast[row * row_stride + i * loop_stride + id]; - float2 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id]; - float2 bias_vec = bias_cast[i * loop_stride + id]; - - __half2* output_data_half = reinterpret_cast<__half2*>(&output_data); - __half2* 
gelu_input_data_half = reinterpret_cast<__half2*>(&gelu_input_data); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 output_half_0 = __half22float2(output_data_half[0]); - float2 output_half_1 = __half22float2(output_data_half[1]); - - float2 gelu_input_half_0 = __half22float2(gelu_input_data_half[0]); - float2 gelu_input_half_1 = __half22float2(gelu_input_data_half[1]); - - float2 bias_half_0 = __half22float2(bias_half[0]); - float2 bias_half_1 = __half22float2(bias_half[1]); - - gelu_input_half_0.x += bias_half_0.x; - gelu_input_half_0.y += bias_half_0.y; - gelu_input_half_1.x += bias_half_1.x; - gelu_input_half_1.y += bias_half_1.y; - - output_half_0.x *= d_gelu(gelu_input_half_0.x); - output_half_0.y *= d_gelu(gelu_input_half_0.y); - output_half_1.x *= d_gelu(gelu_input_half_1.x); - output_half_1.y *= d_gelu(gelu_input_half_1.y); - - float2 result; - __half2* result_half2 = reinterpret_cast<__half2*>(&result); - - result_half2[0] = __float22half2_rn(output_half_0); - result_half2[1] = __float22half2_rn(output_half_1); - - d_output_cast[row * row_stride + i * loop_stride + id] = result; - } - } -#endif -} - -template -void launch_bias_gelu(const T* input, - const T* bias, - T* output, - int intermediate_size, - int batch_size, - cudaStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - fused_bias_gelu<<>>( - input, bias, output, intermediate_size / 4, iterations); -} - -template -void launch_gelu(const T* input, - T* output, - int intermediate_size, - int batch_size, - cudaStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - gelu_kernel<<>>( - input, output, intermediate_size / 4, iterations); -} - -template void launch_bias_gelu(const float*, const 
float*, float*, int, int, cudaStream_t); -template void launch_bias_gelu<__half>(const __half*, - const __half*, - __half*, - int, - int, - cudaStream_t); - -template void launch_gelu(const float*, float*, int, int, cudaStream_t); -template void launch_gelu<__half>(const __half*, __half*, int, int, cudaStream_t); - -template -void launch_d_gelu(T* d_output, - const T* input, - const T* bias, - int intermediate_size, - int batch_size, - cudaStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - d_gelu_func<<>>( - d_output, input, bias, intermediate_size / 4, iterations); -} - -template void launch_d_gelu(float*, const float*, const float*, int, int, cudaStream_t); -template void launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, cudaStream_t); diff --git a/deepspeed/ops/csrc/transformer_bak/gelu_kernels.hip b/deepspeed/ops/csrc/transformer_bak/gelu_kernels.hip deleted file mode 100644 index f7e7a7fa7e0f22a6d20de4a1fbb5c2071afb9c77..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/gelu_kernels.hip +++ /dev/null @@ -1,332 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -inline __device__ float gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); -} - -inline __device__ float d_gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - - float x2mul = x * x * mul_param; - float tan_h = tanhf(sqrt_param * (x + x * x2mul)); - float dg1 = 0.5f * (1.0f + tan_h); - float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h); - float dg3 = dg2 * 3 * x2mul; - return (dg1 + dg2 + dg3); -} - -/* -Fused bias add with GELU - -Loads a vector of 4 elements each iteration, for stride -iterations. It was written with the intention to launch 256 thread -threadblocks, so to launch for bert-large, we would set ITERATIONS -to 4. This is currently done automatically as a heuristic, setting -the number of iterations as blocks of 1024. - -For FP16, the values are loaded from memory as __half, but converted -to FP32 for the arithmetic itself, to prevent numerous overflow on -the intermediate hyperbolic tangent, since there's no intrinsic -that computes it directly. 
-*/ - -__global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float4* input_cast = reinterpret_cast(input); - float4* vals_cast = reinterpret_cast(vals); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 data = input_cast[row * row_stride + i * loop_stride + id]; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - vals_cast[row * row_stride + i * loop_stride + id] = data; - } - } -} - -__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float2* input_cast = reinterpret_cast(input); - float2* vals_cast = reinterpret_cast(vals); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - vals_cast[row * row_stride + i * loop_stride + id] = vals_vec; - } - } -#endif -} - -__global__ void fused_bias_gelu(const float* input, - const float* bias, - float* vals, - int row_stride, - int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float4* input_cast = reinterpret_cast(input); - float4* vals_cast = reinterpret_cast(vals); - const float4* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * 
loop_stride + id < row_stride) { - float4 data = input_cast[row * row_stride + i * loop_stride + id]; - float4 bias_data = bias_cast[i * loop_stride + id]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - vals_cast[row * row_stride + i * loop_stride + id] = data; - } - } -} - -__global__ void fused_bias_gelu(const __half* input, - const __half* bias, - __half* vals, - int row_stride, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - const float2* input_cast = reinterpret_cast(input); - float2* vals_cast = reinterpret_cast(vals); - const float2* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id]; - float2 bias_vec = bias_cast[i * loop_stride + id]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - vals_cast[row * row_stride + i * loop_stride + id] = vals_vec; - } - } -#endif -} - -__global__ void d_gelu_func(float* d_output, - const float* gelu_input, - const float* bias, - int row_stride, - int iterations) -{ - int row = blockIdx.x; - int id = threadIdx.x; - 
int loop_stride = blockDim.x; - - float4* d_output_cast = reinterpret_cast(d_output); - const float4* gelu_input_cast = reinterpret_cast(gelu_input); - const float4* bias_cast = reinterpret_cast(bias); - - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float4 output_data = d_output_cast[row * row_stride + i * loop_stride + id]; - float4 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id]; - float4 bias_data = bias_cast[i * loop_stride + id]; - - gelu_input_data.x += bias_data.x; - gelu_input_data.y += bias_data.y; - gelu_input_data.z += bias_data.z; - gelu_input_data.w += bias_data.w; - - output_data.x *= d_gelu(gelu_input_data.x); - output_data.y *= d_gelu(gelu_input_data.y); - output_data.z *= d_gelu(gelu_input_data.z); - output_data.w *= d_gelu(gelu_input_data.w); - - d_output_cast[row * row_stride + i * loop_stride + id] = output_data; - } - } -} - -__global__ void d_gelu_func(__half* d_output, - const __half* gelu_input, - const __half* bias, - int row_stride, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - int row = blockIdx.x; - int id = threadIdx.x; - int loop_stride = blockDim.x; - - float2* d_output_cast = reinterpret_cast(d_output); - const float2* gelu_input_cast = reinterpret_cast(gelu_input); - const float2* bias_cast = reinterpret_cast(bias); - -#pragma unroll - for (int i = 0; i < iterations; i++) { - if (i * loop_stride + id < row_stride) { - float2 output_data = d_output_cast[row * row_stride + i * loop_stride + id]; - float2 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id]; - float2 bias_vec = bias_cast[i * loop_stride + id]; - - __half2* output_data_half = reinterpret_cast<__half2*>(&output_data); - __half2* gelu_input_data_half = reinterpret_cast<__half2*>(&gelu_input_data); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 output_half_0 = __half22float2(output_data_half[0]); - float2 output_half_1 = 
__half22float2(output_data_half[1]); - - float2 gelu_input_half_0 = __half22float2(gelu_input_data_half[0]); - float2 gelu_input_half_1 = __half22float2(gelu_input_data_half[1]); - - float2 bias_half_0 = __half22float2(bias_half[0]); - float2 bias_half_1 = __half22float2(bias_half[1]); - - gelu_input_half_0.x += bias_half_0.x; - gelu_input_half_0.y += bias_half_0.y; - gelu_input_half_1.x += bias_half_1.x; - gelu_input_half_1.y += bias_half_1.y; - - output_half_0.x *= d_gelu(gelu_input_half_0.x); - output_half_0.y *= d_gelu(gelu_input_half_0.y); - output_half_1.x *= d_gelu(gelu_input_half_1.x); - output_half_1.y *= d_gelu(gelu_input_half_1.y); - - float2 result; - __half2* result_half2 = reinterpret_cast<__half2*>(&result); - - result_half2[0] = __float22half2_rn(output_half_0); - result_half2[1] = __float22half2_rn(output_half_1); - - d_output_cast[row * row_stride + i * loop_stride + id] = result; - } - } -#endif -} - -template -void launch_bias_gelu(const T* input, - const T* bias, - T* output, - int intermediate_size, - int batch_size, - hipStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - hipLaunchKernelGGL(( fused_bias_gelu), dim3(grid_dims), dim3(block_dims), 0, stream, - input, bias, output, intermediate_size / 4, iterations); -} - -template -void launch_gelu(const T* input, - T* output, - int intermediate_size, - int batch_size, - hipStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - hipLaunchKernelGGL(( gelu_kernel), dim3(grid_dims), dim3(block_dims), 0, stream, - input, output, intermediate_size / 4, iterations); -} - -template void launch_bias_gelu(const float*, const float*, float*, int, int, hipStream_t); -template void launch_bias_gelu<__half>(const __half*, 
- const __half*, - __half*, - int, - int, - hipStream_t); - -template void launch_gelu(const float*, float*, int, int, hipStream_t); -template void launch_gelu<__half>(const __half*, __half*, int, int, hipStream_t); - -template -void launch_d_gelu(T* d_output, - const T* input, - const T* bias, - int intermediate_size, - int batch_size, - hipStream_t stream) -{ - int iterations = (intermediate_size + 1023) / 1024; - int threads = (intermediate_size - 1) / (iterations * 4) + 1; - dim3 block_dims(threads); - dim3 grid_dims(batch_size); - - hipLaunchKernelGGL(( d_gelu_func), dim3(grid_dims), dim3(block_dims), 0, stream, - d_output, input, bias, intermediate_size / 4, iterations); -} - -template void launch_d_gelu(float*, const float*, const float*, int, int, hipStream_t); -template void launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, hipStream_t); diff --git a/deepspeed/ops/csrc/transformer_bak/general_kernels.cu b/deepspeed/ops/csrc/transformer_bak/general_kernels.cu deleted file mode 100644 index 1eaa94e1e71a40bf44b661656395cf1ed087f589..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/general_kernels.cu +++ /dev/null @@ -1,411 +0,0 @@ -#include "general_kernels.h" - -namespace cg = cooperative_groups; - -template -__global__ void column_sum_reduce(const T* __restrict__ inp, - T* __restrict__ out, - int rows, - int width) -{ - __shared__ float tile[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - - int y_stride = width * TILE_DIM; - - float localSum = 0; - - // Loop across matrix height - if (idx < width) { - int offset = threadIdx.y * width + idx; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - localSum += (float)inp[offset]; - offset += y_stride; - } - } - - tile[threadIdx.x][threadIdx.y] = localSum; - - __syncthreads(); - - // Sum the shared buffer. 
- float sum = tile[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) sum += g.shfl_down(sum, i); - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - if (pos < width) out[pos] = sum; - } -} - -template -void launch_fuse_transpose_bias_kernel(const T* inp, - T* out, - int rows, - int cols, - cudaStream_t stream); - -template <> -void launch_fuse_transpose_bias_kernel(const float* inp, - float* out, - int rows, - int cols, - cudaStream_t stream) -{ - // assert(rows % TILE_DIM == 0); - // assert(cols % TILE_DIM == 0); - - dim3 grid_dim((cols - 1) / TILE_DIM + 1); - dim3 block_dim(TILE_DIM, TILE_DIM); - - column_sum_reduce<<>>(inp, out, rows, cols); -} - -template <> -void launch_fuse_transpose_bias_kernel<__half>(const __half* inp, - __half* out, - int rows, - int cols, - cudaStream_t stream) -{ - // assert(rows % TILE_DIM == 0); - // assert(cols % TILE_DIM == 0); - - dim3 grid_dim((cols - 1) / TILE_DIM + 1); - dim3 block_dim(TILE_DIM, TILE_DIM); - - column_sum_reduce<__half><<>>(inp, out, rows, cols); -} - -__global__ void fused_add2_kernel(const int N, float* out, const float* inp1, const float* inp2) -{ - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - float4* out_4 = reinterpret_cast(out); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 val; - float4 inp1_reg = inp1_4[j]; - float4 inp2_reg = inp2_4[j]; - - val.x = inp1_reg.x + inp2_reg.x; - val.y = inp1_reg.y + inp2_reg.y; - val.z = inp1_reg.z + inp2_reg.z; - val.w = inp1_reg.w + inp2_reg.w; - - out_4[j] = val; - } -} - -__global__ void fused_add2_kernel(const int N, __half* out, const __half* inp1, const __half* inp2) -{ - float2 inp1_4; - float2 inp2_4; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = 
reinterpret_cast(inp2); - - CUDA_1D_KERNEL_LOOP(j, N) - { - inp1_4 = inp1_arr[j]; - inp2_4 = inp2_arr[j]; - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - inp1_h_f_0.x += inp2_h_f_0.x; - inp1_h_f_0.y += inp2_h_f_0.y; - inp1_h_f_1.x += inp2_h_f_1.x; - inp1_h_f_1.y += inp2_h_f_1.y; - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[j] = val_f; - } -} - -template <> -void launch_fused_add2(float* out, - const float* inp1, - const float* inp2, - int batch_size, - int seq_length, - int hidden_dim, - cudaStream_t& stream) -{ - int total_count = batch_size * seq_length * hidden_dim / 4; - dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - - dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - - fused_add2_kernel<<>>(total_count, out, inp1, inp2); -} - -template <> -void launch_fused_add2<__half>(__half* out, - const __half* inp1, - const __half* inp2, - int batch_size, - int seq_length, - int hidden_dim, - cudaStream_t& stream) -{ - int total_count = batch_size * seq_length * hidden_dim / 4; - dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - - dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - - fused_add2_kernel<<>>(total_count, out, inp1, inp2); -} - -__global__ void fused_add3_kernel(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - const float4* inp3_4 = reinterpret_cast(inp3); - - float4* out_4 = reinterpret_cast(out); - - float4 val; - float4 inp1_reg = inp1_4[row * row_stride + id]; - 
float4 inp2_reg = inp2_4[row * row_stride + id]; - float4 inp3_reg = inp3_4[row * row_stride + id]; - - val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x; - val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y; - val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z; - val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w; - - out_4[row * row_stride + id] = val; -} - -__global__ void fused_add3_kernel(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - const float2* inp3_arr = reinterpret_cast(inp3); - - float2 inp1_4 = inp1_arr[row * row_stride + id]; - float2 inp2_4 = inp2_arr[row * row_stride + id]; - float2 inp3_4 = inp3_arr[row * row_stride + id]; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - __half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4); - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - float2 inp3_h_f_0 = __half22float2(inp3_h[0]); - float2 inp3_h_f_1 = __half22float2(inp3_h[1]); - - inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x); - inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y); - inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x); - inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y); - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[row * row_stride + id] = val_f; -} - -template <> -void launch_fused_add3(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); 
- - dim3 block_dim(hidden_size / 4); - - fused_add3_kernel<<>>( - out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -template <> -void launch_fused_add3<__half>(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - fused_add3_kernel<<>>( - out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -__global__ void fused_add4_kernel(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - const float* inp4, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - const float4* inp3_4 = reinterpret_cast(inp3); - const float4* inp4_4 = reinterpret_cast(inp4); - float4* out_4 = reinterpret_cast(out); - - float4 val; - float4 inp1_reg = inp1_4[row * row_stride + id]; - float4 inp2_reg = inp2_4[row * row_stride + id]; - float4 inp3_reg = inp3_4[row * row_stride + id]; - float4 inp4_reg = inp4_4[row * row_stride + id]; - - val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x + inp4_reg.x; - val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y + inp4_reg.y; - val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z + inp4_reg.z; - val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w + inp4_reg.w; - - out_4[row * row_stride + id] = val; -} - -__global__ void fused_add4_kernel(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - const __half* inp4, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - const float2* inp3_arr = reinterpret_cast(inp3); - const float2* inp4_arr = reinterpret_cast(inp4); - - float2 inp1_4 = inp1_arr[row * row_stride + id]; - float2 
inp2_4 = inp2_arr[row * row_stride + id]; - float2 inp3_4 = inp3_arr[row * row_stride + id]; - float2 inp4_4 = inp4_arr[row * row_stride + id]; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - __half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4); - __half2* inp4_h = reinterpret_cast<__half2*>(&inp4_4); - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - float2 inp3_h_f_0 = __half22float2(inp3_h[0]); - float2 inp3_h_f_1 = __half22float2(inp3_h[1]); - - float2 inp4_h_f_0 = __half22float2(inp4_h[0]); - float2 inp4_h_f_1 = __half22float2(inp4_h[1]); - - inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x + inp4_h_f_0.x); - inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y + inp4_h_f_0.y); - inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x + inp4_h_f_1.x); - inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y + inp4_h_f_1.y); - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[row * row_stride + id] = val_f; -} - -template <> -void launch_fused_add4(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - const float* inp4, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - fused_add4_kernel<<>>( - out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -template <> -void launch_fused_add4<__half>(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - const __half* inp4, - int batch_size, - int seq_length, - int hidden_size, - cudaStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size 
/ 4); - - fused_add4_kernel<<>>( - out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4); -} diff --git a/deepspeed/ops/csrc/transformer_bak/general_kernels.hip b/deepspeed/ops/csrc/transformer_bak/general_kernels.hip deleted file mode 100644 index 5be2fc240debf3dbddba72f0f9587331222910df..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/general_kernels.hip +++ /dev/null @@ -1,413 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "general_kernels_hip.h" - -namespace cg = cooperative_groups; - -template -__global__ void column_sum_reduce(const T* __restrict__ inp, - T* __restrict__ out, - int rows, - int width) -{ - __shared__ float tile[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - - int y_stride = width * TILE_DIM; - - float localSum = 0; - - // Loop across matrix height - if (idx < width) { - int offset = threadIdx.y * width + idx; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - localSum += (float)inp[offset]; - offset += y_stride; - } - } - - tile[threadIdx.x][threadIdx.y] = localSum; - - __syncthreads(); - - // Sum the shared buffer. 
- float sum = tile[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) sum += g.shfl_down(sum, i); - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - if (pos < width) out[pos] = sum; - } -} - -template -void launch_fuse_transpose_bias_kernel(const T* inp, - T* out, - int rows, - int cols, - hipStream_t stream); - -template <> -void launch_fuse_transpose_bias_kernel(const float* inp, - float* out, - int rows, - int cols, - hipStream_t stream) -{ - // assert(rows % TILE_DIM == 0); - // assert(cols % TILE_DIM == 0); - - dim3 grid_dim((cols - 1) / TILE_DIM + 1); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( column_sum_reduce), dim3(grid_dim), dim3(block_dim), 0, stream, inp, out, rows, cols); -} - -template <> -void launch_fuse_transpose_bias_kernel<__half>(const __half* inp, - __half* out, - int rows, - int cols, - hipStream_t stream) -{ - // assert(rows % TILE_DIM == 0); - // assert(cols % TILE_DIM == 0); - - dim3 grid_dim((cols - 1) / TILE_DIM + 1); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( column_sum_reduce<__half>), dim3(grid_dim), dim3(block_dim), 0, stream, inp, out, rows, cols); -} - -__global__ void fused_add2_kernel(const int N, float* out, const float* inp1, const float* inp2) -{ - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - float4* out_4 = reinterpret_cast(out); - - CUDA_1D_KERNEL_LOOP(j, N) - { - float4 val; - float4 inp1_reg = inp1_4[j]; - float4 inp2_reg = inp2_4[j]; - - val.x = inp1_reg.x + inp2_reg.x; - val.y = inp1_reg.y + inp2_reg.y; - val.z = inp1_reg.z + inp2_reg.z; - val.w = inp1_reg.w + inp2_reg.w; - - out_4[j] = val; - } -} - -__global__ void fused_add2_kernel(const int N, __half* out, const __half* inp1, const __half* inp2) -{ - float2 inp1_4; - float2 inp2_4; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = 
reinterpret_cast<__half2*>(&inp2_4); - - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - - CUDA_1D_KERNEL_LOOP(j, N) - { - inp1_4 = inp1_arr[j]; - inp2_4 = inp2_arr[j]; - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - inp1_h_f_0.x += inp2_h_f_0.x; - inp1_h_f_0.y += inp2_h_f_0.y; - inp1_h_f_1.x += inp2_h_f_1.x; - inp1_h_f_1.y += inp2_h_f_1.y; - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[j] = val_f; - } -} - -template <> -void launch_fused_add2(float* out, - const float* inp1, - const float* inp2, - int batch_size, - int seq_length, - int hidden_dim, - hipStream_t& stream) -{ - int total_count = batch_size * seq_length * hidden_dim / 4; - dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - - dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - - hipLaunchKernelGGL(( fused_add2_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, total_count, out, inp1, inp2); -} - -template <> -void launch_fused_add2<__half>(__half* out, - const __half* inp1, - const __half* inp2, - int batch_size, - int seq_length, - int hidden_dim, - hipStream_t& stream) -{ - int total_count = batch_size * seq_length * hidden_dim / 4; - dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - - dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - - hipLaunchKernelGGL(( fused_add2_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, total_count, out, inp1, inp2); -} - -__global__ void fused_add3_kernel(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - - const float4* 
inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - const float4* inp3_4 = reinterpret_cast(inp3); - - float4* out_4 = reinterpret_cast(out); - - float4 val; - float4 inp1_reg = inp1_4[row * row_stride + id]; - float4 inp2_reg = inp2_4[row * row_stride + id]; - float4 inp3_reg = inp3_4[row * row_stride + id]; - - val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x; - val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y; - val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z; - val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w; - - out_4[row * row_stride + id] = val; -} - -__global__ void fused_add3_kernel(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - const float2* inp3_arr = reinterpret_cast(inp3); - - float2 inp1_4 = inp1_arr[row * row_stride + id]; - float2 inp2_4 = inp2_arr[row * row_stride + id]; - float2 inp3_4 = inp3_arr[row * row_stride + id]; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - __half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4); - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - float2 inp3_h_f_0 = __half22float2(inp3_h[0]); - float2 inp3_h_f_1 = __half22float2(inp3_h[1]); - - inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x); - inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y); - inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x); - inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y); - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[row * row_stride + id] = 
val_f; -} - -template <> -void launch_fused_add3(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - hipLaunchKernelGGL(( fused_add3_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -template <> -void launch_fused_add3<__half>(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - hipLaunchKernelGGL(( fused_add3_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -__global__ void fused_add4_kernel(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - const float* inp4, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - - const float4* inp1_4 = reinterpret_cast(inp1); - const float4* inp2_4 = reinterpret_cast(inp2); - const float4* inp3_4 = reinterpret_cast(inp3); - const float4* inp4_4 = reinterpret_cast(inp4); - float4* out_4 = reinterpret_cast(out); - - float4 val; - float4 inp1_reg = inp1_4[row * row_stride + id]; - float4 inp2_reg = inp2_4[row * row_stride + id]; - float4 inp3_reg = inp3_4[row * row_stride + id]; - float4 inp4_reg = inp4_4[row * row_stride + id]; - - val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x + inp4_reg.x; - val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y + inp4_reg.y; - val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z + inp4_reg.z; - val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w + inp4_reg.w; - - out_4[row * row_stride + id] = val; -} - -__global__ void fused_add4_kernel(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, 
- const __half* inp4, - int size, - int row_stride) -{ - int row = blockIdx.x; - int id = threadIdx.x; - const float2* inp1_arr = reinterpret_cast(inp1); - const float2* inp2_arr = reinterpret_cast(inp2); - const float2* inp3_arr = reinterpret_cast(inp3); - const float2* inp4_arr = reinterpret_cast(inp4); - - float2 inp1_4 = inp1_arr[row * row_stride + id]; - float2 inp2_4 = inp2_arr[row * row_stride + id]; - float2 inp3_4 = inp3_arr[row * row_stride + id]; - float2 inp4_4 = inp4_arr[row * row_stride + id]; - - __half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4); - __half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4); - __half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4); - __half2* inp4_h = reinterpret_cast<__half2*>(&inp4_4); - - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - - float2 inp3_h_f_0 = __half22float2(inp3_h[0]); - float2 inp3_h_f_1 = __half22float2(inp3_h[1]); - - float2 inp4_h_f_0 = __half22float2(inp4_h[0]); - float2 inp4_h_f_1 = __half22float2(inp4_h[1]); - - inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x + inp4_h_f_0.x); - inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y + inp4_h_f_0.y); - inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x + inp4_h_f_1.x); - inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y + inp4_h_f_1.y); - - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); - - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); - - float2* out_4 = reinterpret_cast(out); - out_4[row * row_stride + id] = val_f; -} - -template <> -void launch_fused_add4(float* out, - const float* inp1, - const float* inp2, - const float* inp3, - const float* inp4, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - hipLaunchKernelGGL(( fused_add4_kernel), 
dim3(grid_dim), dim3(block_dim), 0, stream, - out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4); -} - -template <> -void launch_fused_add4<__half>(__half* out, - const __half* inp1, - const __half* inp2, - const __half* inp3, - const __half* inp4, - int batch_size, - int seq_length, - int hidden_size, - hipStream_t& stream) -{ - dim3 grid_dim(batch_size * seq_length); - - dim3 block_dim(hidden_size / 4); - - hipLaunchKernelGGL(( fused_add4_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, - out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4); -} diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/apply_rotary_pos_emb.cu b/deepspeed/ops/csrc/transformer_bak/inference/csrc/apply_rotary_pos_emb.cu deleted file mode 100644 index 524a63a83daeb97731d0685359b7be8412712d8e..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/apply_rotary_pos_emb.cu +++ /dev/null @@ -1,372 +0,0 @@ -#include "custom_cuda_layers.h" - -//#include - -namespace cg = cooperative_groups; - -__global__ void apply_rotary_pos_emb(float* mixed_query, - float* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; - - lane += WARP_SIZE; - } - } -} - -__global__ void apply_rotary_pos_emb(__half* mixed_query, - __half* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; - - lane += WARP_SIZE; - } - } -#endif -} -__global__ void apply_rotary_pos_emb1(float* mixed_query, - float* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; - - lane += WARP_SIZE; - } - } -} -__global__ void apply_rotary_pos_emb1(__half* mixed_query, - __half* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - constexpr unsigned mask[32] = { - 0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, 0x10 | 0x10000, - 0x20 | 0x20000, 0x40 | 0x40000, 0x80 | 0x80000, 0x100 | 0x100000, 0x200 | 0x200000, - 0x400 | 0x400000, 0x800 | 0x800000, 0x1000 | 0x1, 0x2000 | 0x2, 0x4000 | 0x4, - 0x8000 | 0x8, 0x10000 | 0x10, 0x20000 | 0x20, 0x40000 | 0x40, 0x80000 | 0x80, - 0x100000 | 0x100, 0x200000 | 0x200, 0x400000 | 0x400, 0x800000 | 0x800, 0x1000000, - 0x2000000, 0x4000000, 0x8000000, 0x10000000, 0x20000000, - 0x40000000, 0x80000000}; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - unsigned half_dim = rotary_dim >> 1; - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane % half_dim) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; - float rotary_sign = (lane > (half_dim - 1) ? -1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - auto q_rot_tmp = lane < half_dim ? 
__shfl_sync(mask[lane], q_rot, lane + half_dim) - : __shfl_sync(mask[lane], q_rot, lane - half_dim); - auto k_rot_tmp = lane < half_dim ? __shfl_sync(mask[lane], k_rot, lane + half_dim) - : __shfl_sync(mask[lane], k_rot, lane - half_dim); - q = q * cosf(inv_freq) + q_rot_tmp * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); - - mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; - - lane += WARP_SIZE; - } - } -#endif -} - -template -void launch_apply_rotary_pos_emb(T* mixed_query, - T* key_layer, - unsigned head_size, - unsigned seq_len, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - unsigned batch, - bool rotate_half, - bool rotate_every_two, - cudaStream_t stream) -{ - int total_count = batch * num_heads * seq_len; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); - if (rotate_every_two) - apply_rotary_pos_emb<<>>( - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); - else if (rotate_half) - apply_rotary_pos_emb1<<>>( - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); -} - -template void launch_apply_rotary_pos_emb(float*, - float*, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - bool, - bool, - cudaStream_t); -template void launch_apply_rotary_pos_emb<__half>(__half*, - __half*, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - bool, - bool, - cudaStream_t); -/* -__global__ void apply_rotary_pos_emb(float* mixed_query, -float* key_layer, -unsigned rotary_dim, -unsigned seq_len, -unsigned seq_offset, -unsigned num_heads, -unsigned head_size, -unsigned total_count) -{ -cg::thread_block b = cg::this_thread_block(); -cg::thread_block_tile g = cg::tiled_partition(b); - -int id = threadIdx.x; -int gid = id >> 5; -int lane = id & 0x1f; - -unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; -unsigned offset = head_id * 
head_size; - -unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - -if (head_id < total_count) { -while (lane < rotary_dim) { -float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; -inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; -float q = mixed_query[offset + lane]; -float k = key_layer[offset + lane]; -float rotary_sign = (lane % 2 == 1 ? -1.0 : 1.0); -float q_rot = (q * rotary_sign); -float k_rot = (k * rotary_sign); -q_rot = g.shfl_xor(q_rot, 1); -k_rot = g.shfl_xor(k_rot, 1); -q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); -k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - -mixed_query[offset + lane] = q; -key_layer[offset + lane] = k; - -lane += WARP_SIZE; -} -} -} - -__global__ void apply_rotary_pos_emb(__half* mixed_query, -__half* key_layer, -unsigned rotary_dim, -unsigned seq_len, -unsigned seq_offset, -unsigned num_heads, -unsigned head_size, -unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 -cg::thread_block b = cg::this_thread_block(); -cg::thread_block_tile g = cg::tiled_partition(b); - -int id = threadIdx.x; -int gid = id >> 5; -int lane = id & 0x1f; - -unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; -unsigned offset = head_id * head_size; -constexpr unsigned mask[32] = {0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, -0x10 | 0x10000, 0x20 | 0x20000, 0x40 | 0x40000, 0x80 | 0x80000, -0x100 | 0x100000, 0x200 | 0x200000, 0x400 | 0x400000, 0x800 | 0x800000, -0x1000 | 0x1, 0x2000 | 0x2, 0x4000 | 0x4, 0x8000 | 0x8, -0x10000 | 0x10, 0x20000 | 0x20, 0x40000 | 0x40, 0x80000 | 0x80, -0x100000 | 0x100, 0x200000 | 0x200, 0x400000 | 0x400, 0x800000 | 0x800, -0x1000000, 0x2000000, 0x4000000, 0x8000000, -0x10000000, 0x20000000, 0x40000000, 0x80000000}; -unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - -if (head_id < total_count) { -while (lane < rotary_dim) { -//float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; -float inv_freq = (float)((lane % (rotary_dim >> 1)) * 2) / (float)rotary_dim; 
-inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; -float q = (float)mixed_query[offset + lane]; -float k = (float)key_layer[offset + lane]; -float rotary_sign = (lane > 11 ? -1.0 : 1.0); -float q_rot = (q * rotary_sign); -float k_rot = (k * rotary_sign); -auto q_rot_tmp = lane < 12 ? __shfl_sync(mask[lane], q_rot, lane + 12) : __shfl_sync(mask[lane], -q_rot, lane - 12);//g.shfl_xor(q_rot, 12); auto k_rot_tmp = lane < 12 ? __shfl_sync(mask[lane], -k_rot, lane + 12) : __shfl_sync(mask[lane], k_rot, lane - 12);//g.shfl_xor(k_rot, 12); q = q * -cosf(inv_freq) + q_rot_tmp * sinf(inv_freq); k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); - -mixed_query[offset + lane] = (__half)q; -key_layer[offset + lane] = (__half)k; - -lane += WARP_SIZE; -} -} -#endif -} - -template -void launch_apply_rotary_pos_emb(T* mixed_query, -T* key_layer, -unsigned head_size, -unsigned seq_len, -unsigned rotary_dim, -unsigned offset, -unsigned num_heads, -unsigned batch, -cudaStream_t stream) -{ -int total_count = batch * num_heads * seq_len; -dim3 block_dims(1024); -dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); - -apply_rotary_pos_emb<<>>( -mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); -} - -template void launch_apply_rotary_pos_emb(float*, -float*, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -cudaStream_t); -template void launch_apply_rotary_pos_emb<__half>(__half*, -__half*, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -cudaStream_t); -*/ diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/apply_rotary_pos_emb.hip b/deepspeed/ops/csrc/transformer_bak/inference/csrc/apply_rotary_pos_emb.hip deleted file mode 100644 index 4d70a0a80a6d831ea419624eda6afd9f186d4501..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/apply_rotary_pos_emb.hip +++ /dev/null @@ -1,374 +0,0 @@ -// !!! 
This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -//#include - -namespace cg = cooperative_groups; - -__global__ void apply_rotary_pos_emb(float* mixed_query, - float* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; - - lane += WARP_SIZE; - } - } -} - -__global__ void apply_rotary_pos_emb(__half* mixed_query, - __half* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; - - lane += WARP_SIZE; - } - } -#endif -} -__global__ void apply_rotary_pos_emb1(float* mixed_query, - float* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = mixed_query[offset + lane]; - float k = key_layer[offset + lane]; - float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = q; - key_layer[offset + lane] = k; - - lane += WARP_SIZE; - } - } -} -__global__ void apply_rotary_pos_emb1(__half* mixed_query, - __half* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - constexpr unsigned mask[32] = { - 0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, 0x10 | 0x10000, - 0x20 | 0x20000, 0x40 | 0x40000, 0x80 | 0x80000, 0x100 | 0x100000, 0x200 | 0x200000, - 0x400 | 0x400000, 0x800 | 0x800000, 0x1000 | 0x1, 0x2000 | 0x2, 0x4000 | 0x4, - 0x8000 | 0x8, 0x10000 | 0x10, 0x20000 | 0x20, 0x40000 | 0x40, 0x80000 | 0x80, - 0x100000 | 0x100, 0x200000 | 0x200, 0x400000 | 0x400, 0x800000 | 0x800, 0x1000000, - 0x2000000, 0x4000000, 0x8000000, 0x10000000, 0x20000000, - 0x40000000, 0x80000000}; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - unsigned half_dim = rotary_dim >> 1; - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane % half_dim) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[offset + lane]; - float rotary_sign = (lane > (half_dim - 1) ? -1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - auto q_rot_tmp = lane < half_dim ? 
__shfl_sync(mask[lane], q_rot, lane + half_dim) - : __shfl_sync(mask[lane], q_rot, lane - half_dim); - auto k_rot_tmp = lane < half_dim ? __shfl_sync(mask[lane], k_rot, lane + half_dim) - : __shfl_sync(mask[lane], k_rot, lane - half_dim); - q = q * cosf(inv_freq) + q_rot_tmp * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); - - mixed_query[offset + lane] = (__half)q; - key_layer[offset + lane] = (__half)k; - - lane += WARP_SIZE; - } - } -#endif -} - -template -void launch_apply_rotary_pos_emb(T* mixed_query, - T* key_layer, - unsigned head_size, - unsigned seq_len, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - unsigned batch, - bool rotate_half, - bool rotate_every_two, - hipStream_t stream) -{ - int total_count = batch * num_heads * seq_len; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); - if (rotate_every_two) - hipLaunchKernelGGL(( apply_rotary_pos_emb), dim3(grid_dims), dim3(block_dims), 0, stream, - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); - else if (rotate_half) - hipLaunchKernelGGL(( apply_rotary_pos_emb1), dim3(grid_dims), dim3(block_dims), 0, stream, - mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); -} - -template void launch_apply_rotary_pos_emb(float*, - float*, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - bool, - bool, - hipStream_t); -template void launch_apply_rotary_pos_emb<__half>(__half*, - __half*, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - unsigned, - bool, - bool, - hipStream_t); -/* -__global__ void apply_rotary_pos_emb(float* mixed_query, -float* key_layer, -unsigned rotary_dim, -unsigned seq_len, -unsigned seq_offset, -unsigned num_heads, -unsigned head_size, -unsigned total_count) -{ -cg::thread_block b = cg::this_thread_block(); -cg::thread_block_tile g = cg::tiled_partition(b); - -int id = threadIdx.x; -int gid = 
id >> 5; -int lane = id & 0x1f; - -unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; -unsigned offset = head_id * head_size; - -unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - -if (head_id < total_count) { -while (lane < rotary_dim) { -float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; -inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; -float q = mixed_query[offset + lane]; -float k = key_layer[offset + lane]; -float rotary_sign = (lane % 2 == 1 ? -1.0 : 1.0); -float q_rot = (q * rotary_sign); -float k_rot = (k * rotary_sign); -q_rot = g.shfl_xor(q_rot, 1); -k_rot = g.shfl_xor(k_rot, 1); -q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); -k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - -mixed_query[offset + lane] = q; -key_layer[offset + lane] = k; - -lane += WARP_SIZE; -} -} -} - -__global__ void apply_rotary_pos_emb(__half* mixed_query, -__half* key_layer, -unsigned rotary_dim, -unsigned seq_len, -unsigned seq_offset, -unsigned num_heads, -unsigned head_size, -unsigned total_count) -{ -#if __CUDA_ARCH__ >= 700 -cg::thread_block b = cg::this_thread_block(); -cg::thread_block_tile g = cg::tiled_partition(b); - -int id = threadIdx.x; -int gid = id >> 5; -int lane = id & 0x1f; - -unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; -unsigned offset = head_id * head_size; -constexpr unsigned mask[32] = {0x1 | 0x1000, 0x2 | 0x2000, 0x4 | 0x4000, 0x8 | 0x8000, -0x10 | 0x10000, 0x20 | 0x20000, 0x40 | 0x40000, 0x80 | 0x80000, -0x100 | 0x100000, 0x200 | 0x200000, 0x400 | 0x400000, 0x800 | 0x800000, -0x1000 | 0x1, 0x2000 | 0x2, 0x4000 | 0x4, 0x8000 | 0x8, -0x10000 | 0x10, 0x20000 | 0x20, 0x40000 | 0x40, 0x80000 | 0x80, -0x100000 | 0x100, 0x200000 | 0x200, 0x400000 | 0x400, 0x800000 | 0x800, -0x1000000, 0x2000000, 0x4000000, 0x8000000, -0x10000000, 0x20000000, 0x40000000, 0x80000000}; -unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - -if (head_id < total_count) { -while (lane < rotary_dim) { -//float inv_freq = 
(float)((lane / 2) * 2) / (float)rotary_dim; -float inv_freq = (float)((lane % (rotary_dim >> 1)) * 2) / (float)rotary_dim; -inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; -float q = (float)mixed_query[offset + lane]; -float k = (float)key_layer[offset + lane]; -float rotary_sign = (lane > 11 ? -1.0 : 1.0); -float q_rot = (q * rotary_sign); -float k_rot = (k * rotary_sign); -auto q_rot_tmp = lane < 12 ? __shfl_sync(mask[lane], q_rot, lane + 12) : __shfl_sync(mask[lane], -q_rot, lane - 12);//g.shfl_xor(q_rot, 12); auto k_rot_tmp = lane < 12 ? __shfl_sync(mask[lane], -k_rot, lane + 12) : __shfl_sync(mask[lane], k_rot, lane - 12);//g.shfl_xor(k_rot, 12); q = q * -cosf(inv_freq) + q_rot_tmp * sinf(inv_freq); k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); - -mixed_query[offset + lane] = (__half)q; -key_layer[offset + lane] = (__half)k; - -lane += WARP_SIZE; -} -} -#endif -} - -template -void launch_apply_rotary_pos_emb(T* mixed_query, -T* key_layer, -unsigned head_size, -unsigned seq_len, -unsigned rotary_dim, -unsigned offset, -unsigned num_heads, -unsigned batch, -hipStream_t stream) -{ -int total_count = batch * num_heads * seq_len; -dim3 block_dims(1024); -dim3 grid_dims((total_count - 1) / MAX_WARP_NUM + 1); // (batch_size); -hipLaunchKernelGGL(( -apply_rotary_pos_emb), dim3(grid_dims), dim3(block_dims), 0, stream, -mixed_query, key_layer, rotary_dim, seq_len, offset, num_heads, head_size, total_count); -} - -template void launch_apply_rotary_pos_emb(float*, -float*, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -hipStream_t); -template void launch_apply_rotary_pos_emb<__half>(__half*, -__half*, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -unsigned, -hipStream_t); -*/ diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/dequantize.cu b/deepspeed/ops/csrc/transformer_bak/inference/csrc/dequantize.cu deleted file mode 100644 index 
4ddaabda3eb70b1b958b1fc4c2f959867828d1a2..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/dequantize.cu +++ /dev/null @@ -1,110 +0,0 @@ -#include "custom_cuda_layers.h" - -#define MAX_QUANTIZE_GROUPING 1024 - -#define loop_unroll 1 -#define loop_unroll_bits 1 - -__global__ void dequantize_kernel(float* output, - const int8_t* input, - const float* qscale, - int output_size, - int hidden_dim, - int groups, - int merge_count) -{ - unsigned merge_hidden = hidden_dim >> merge_count; - unsigned quantization_stride = (merge_hidden * output_size) / groups; - - unsigned bid = blockIdx.x; - unsigned tid = threadIdx.x; - - while (tid < output_size) { - unsigned w_index = bid / merge_hidden; - unsigned q_index = tid + bid * output_size; - - auto q = input[q_index]; - - unsigned merge_hidden_total = w_index * merge_hidden; - unsigned scale_index = - ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) - << merge_count) + - w_index; - - float scale_data = qscale[scale_index]; - - output[q_index] = (scale_data * (float)q); - tid += blockDim.x; - } -} - -__global__ void dequantize_kernel(__half* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count) -{ -#ifdef HALF_PRECISION_AVAILABLE - - unsigned merge_hidden = hidden_dim >> merge_count; - unsigned quantization_stride = (merge_hidden * output_size) / groups; - - unsigned bid = blockIdx.x; - unsigned tid = threadIdx.x; - - while (tid < output_size) { - unsigned w_index = bid / merge_hidden; - unsigned q_index = tid + bid * output_size; - - auto q = input[q_index]; - - unsigned merge_hidden_total = w_index * merge_hidden; - unsigned scale_index = - ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) - << merge_count) + - w_index; - - float scale_data = qscale[scale_index]; - - output[q_index] = __float2half(scale_data * (float)q); - tid += blockDim.x; 
- } -#endif -} - -template -void launch_dequantize(T* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count, - cudaStream_t stream) -{ - unsigned threads = 1024; - dim3 block_dims(threads); - dim3 grid_dims(hidden_dim); - - dequantize_kernel<<>>( - output, input, qscale, output_size, hidden_dim, groups, merge_count); -} - -template void launch_dequantize(float*, - const int8_t*, - const float*, - unsigned, - unsigned, - unsigned, - unsigned, - cudaStream_t); -template void launch_dequantize<__half>(__half*, - const int8_t*, - const float*, - unsigned, - unsigned, - unsigned, - unsigned, - cudaStream_t); diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/dequantize.hip b/deepspeed/ops/csrc/transformer_bak/inference/csrc/dequantize.hip deleted file mode 100644 index 7c22e306aace1058947ed47e58c0427a4f066ecb..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/dequantize.hip +++ /dev/null @@ -1,112 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -#define MAX_QUANTIZE_GROUPING 1024 - -#define loop_unroll 1 -#define loop_unroll_bits 1 - -__global__ void dequantize_kernel(float* output, - const int8_t* input, - const float* qscale, - int output_size, - int hidden_dim, - int groups, - int merge_count) -{ - unsigned merge_hidden = hidden_dim >> merge_count; - unsigned quantization_stride = (merge_hidden * output_size) / groups; - - unsigned bid = blockIdx.x; - unsigned tid = threadIdx.x; - - while (tid < output_size) { - unsigned w_index = bid / merge_hidden; - unsigned q_index = tid + bid * output_size; - - auto q = input[q_index]; - - unsigned merge_hidden_total = w_index * merge_hidden; - unsigned scale_index = - ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) - << merge_count) + - w_index; - - float scale_data = qscale[scale_index]; - - output[q_index] = (scale_data * (float)q); - tid += blockDim.x; - } -} - -__global__ void dequantize_kernel(__half* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count) -{ -#ifdef HALF_PRECISION_AVAILABLE - - unsigned merge_hidden = hidden_dim >> merge_count; - unsigned quantization_stride = (merge_hidden * output_size) / groups; - - unsigned bid = blockIdx.x; - unsigned tid = threadIdx.x; - - while (tid < output_size) { - unsigned w_index = bid / merge_hidden; - unsigned q_index = tid + bid * output_size; - - auto q = input[q_index]; - - unsigned merge_hidden_total = w_index * merge_hidden; - unsigned scale_index = - ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) - << merge_count) + - w_index; - - float scale_data = qscale[scale_index]; - - output[q_index] = __float2half(scale_data * (float)q); - tid += blockDim.x; - } -#endif -} - -template -void launch_dequantize(T* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, 
- unsigned groups, - unsigned merge_count, - hipStream_t stream) -{ - unsigned threads = 1024; - dim3 block_dims(threads); - dim3 grid_dims(hidden_dim); - - hipLaunchKernelGGL(( dequantize_kernel), dim3(grid_dims), dim3(block_dims), 0, stream, - output, input, qscale, output_size, hidden_dim, groups, merge_count); -} - -template void launch_dequantize(float*, - const int8_t*, - const float*, - unsigned, - unsigned, - unsigned, - unsigned, - hipStream_t); -template void launch_dequantize<__half>(__half*, - const int8_t*, - const float*, - unsigned, - unsigned, - unsigned, - unsigned, - hipStream_t); diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/gelu.cu b/deepspeed/ops/csrc/transformer_bak/inference/csrc/gelu.cu deleted file mode 100644 index 70bbf42cf9ed74558ce1b789d939c17d38573a86..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/gelu.cu +++ /dev/null @@ -1,525 +0,0 @@ -#include "custom_cuda_layers.h" - -#define MAX_CAP 4 -#define MAX_SEQ 2048 - -inline __device__ float gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); -} - -__global__ void fused_bias_gelu(float* input, - const float* bias, - int total_count, - int intermediate_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_gelu(__half* input, - const __half* bias, - int total_count, - int 
intermediate_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_gelu(T* input, - const T* bias, - int intermediate_size, - int batch_size, - cudaStream_t stream) -{ - int total_count = batch_size * (intermediate_size / 4); - int threads = 1024; // intermediate_size / iterations / 4; - dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / 1024 + 1)); // (batch_size); - - fused_bias_gelu<<>>( - input, bias, total_count, intermediate_size / 4); -} - -template void launch_bias_gelu(float*, const float*, int, int, cudaStream_t); -template void launch_bias_gelu<__half>(__half*, const __half*, int, int, cudaStream_t); - -__global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - 
float4 bias_data = bias_cast[offset % hidden_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % hidden_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream) -{ - int total_count = batch_size * (hidden_size / 4); - int threads = 1024; // hidden_size / iterations / 4; - dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / threads + 1)); // (batch_size); - - fused_bias_add<<>>(input, bias, total_count, hidden_size / 4); -} - -template void launch_bias_add(float*, const float*, int, int, cudaStream_t); -template void launch_bias_add<__half>(__half*, const __half*, int, int, cudaStream_t); - -__global__ void fused_bias_residual(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - int mp_size) -{ - float4* input_cast = reinterpret_cast(input); - 
float4* output_cast = reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x); - data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y); - data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z); - data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; - } -} - -__global__ void fused_bias_residual(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - int mp_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); 
- - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - (low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - (low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - (high_data.x + high_res.x) * mp_size + (high_out.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - (high_data.y + high_res.y) * mp_size + (high_out.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_residual(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int hidden_dim, - int mp_size, - cudaStream_t stream) -{ - int total_count = batch * hidden_dim / 4; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - - fused_bias_residual<<>>( - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); -} - -template void -launch_bias_residual(float*, float*, float*, float*, float*, int, int, int, cudaStream_t); -template void launch_bias_residual<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - cudaStream_t); - -__global__ void gptj_residual_add(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - float mp_size) -{ - float4* input_cast = reinterpret_cast(input); - float4* 
output_cast = reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = data.x * mp_size + (out.x + res_vec.x + bias_data.x + attn_bias.x); - data.y = data.y * mp_size + (out.y + res_vec.y + bias_data.y + attn_bias.y); - data.z = data.z * mp_size + (out.z + res_vec.z + bias_data.z + attn_bias.z); - data.w = data.w * mp_size + (out.w + res_vec.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; - } -} - -__global__ void gptj_residual_add(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - float mp_size) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = 
reinterpret_cast<__half2*>(&attn_bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - low_data.x * mp_size + (low_out.x + low_res.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - low_data.y * mp_size + (low_out.y + low_res.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - high_data.x * mp_size + (high_out.x + high_res.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - high_data.y * mp_size + (high_out.y + high_res.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_gptj_residual_add(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int hidden_dim, - int batch, - int mp_size, - cudaStream_t stream) -{ - int total_count = batch * hidden_dim / 4; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - - gptj_residual_add<<>>( - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); -} - -template void launch_gptj_residual_add(float*, - float*, - float*, - float*, - float*, - int, - int, - int, - cudaStream_t); -template void launch_gptj_residual_add<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - cudaStream_t); - -__global__ void moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - int hidden_dim) -{ - unsigned tid = threadIdx.x; - float4* residual_cast = 
reinterpret_cast(residual); - float4* coef_cast = reinterpret_cast(coef); - float4* mlp_out_cast = reinterpret_cast(mlp_out); - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - float4* coef_cast2 = coef_cast + hidden_dim; - - while (tid < hidden_dim) { - float4 res = residual_cast[tid]; - float4 mlp = mlp_out_cast[tid]; - float4 coef1 = coef_cast[tid]; - float4 coef2 = coef_cast2[tid]; - mlp.x = mlp.x * coef2.x + res.x * coef1.x; - mlp.y = mlp.y * coef2.y + res.y * coef1.y; - mlp.z = mlp.z * coef2.z + res.z * coef1.z; - mlp.w = mlp.w * coef2.w + res.w * coef1.w; - mlp_out_cast[tid] = mlp; - tid += blockDim.x; - } -} - -__global__ void moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim) -{ - unsigned tid = threadIdx.x; - - float2* residual_cast = reinterpret_cast(residual); - float2* mlp_out_cast = reinterpret_cast(mlp_out); - float2* coef_cast = reinterpret_cast(coef); - float2* coef_cast2 = coef_cast + hidden_dim; - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - while (tid < hidden_dim) { - float2 res = residual_cast[tid]; - float2 coef1 = coef_cast[tid]; - float2 coef2 = coef_cast[tid]; - float2 data = mlp_out_cast[tid]; - __half* data_h = reinterpret_cast<__half*>(&data); - __half* coef1_h = reinterpret_cast<__half*>(&coef1); - __half* coef2_h = reinterpret_cast<__half*>(&coef2); - __half* res_h = reinterpret_cast<__half*>(&res); - data_h[0] = res_h[0] * coef1_h[0] + data_h[0] * coef2_h[0]; - data_h[1] = res_h[1] * coef1_h[1] + data_h[1] * coef2_h[1]; - data_h[2] = res_h[2] * coef1_h[2] + data_h[2] * coef2_h[2]; - data_h[3] = res_h[3] * coef1_h[3] + data_h[3] * coef2_h[3]; - - mlp_out_cast[tid] = data; - tid += blockDim.x; - } -} - -template -void launch_moe_res_matmul(T* residual, - T* coef, - T* mlp_out, - int seq_len, - int hidden_dim, - cudaStream_t stream) -{ - dim3 grid_dim(seq_len); - dim3 block_dim(1024); - 
moe_res_matmul<<>>( - residual, coef, mlp_out, seq_len, hidden_dim / 4); -} - -template void launch_moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - int hidden_dim, - cudaStream_t stream); -template void launch_moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/gelu.hip b/deepspeed/ops/csrc/transformer_bak/inference/csrc/gelu.hip deleted file mode 100644 index 00c03efb9b6b3d7b05f19549472b5b771f46e1f4..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/gelu.hip +++ /dev/null @@ -1,527 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -#define MAX_CAP 4 -#define MAX_SEQ 2048 - -inline __device__ float gelu(const float x) -{ - const float sqrt_param = 0.79788456080286535587989211986876f; - const float mul_param = 0.044715; - return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x))); -} - -__global__ void fused_bias_gelu(float* input, - const float* bias, - int total_count, - int intermediate_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - - data.x += bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - data.x = gelu(data.x); - data.y = gelu(data.y); - data.z = gelu(data.z); - data.w = gelu(data.w); - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_gelu(__half* input, - const __half* bias, - int total_count, - int intermediate_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = 
reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - low_data.x = gelu(low_data.x); - low_data.y = gelu(low_data.y); - high_data.x = gelu(high_data.x); - high_data.y = gelu(high_data.y); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_gelu(T* input, - const T* bias, - int intermediate_size, - int batch_size, - hipStream_t stream) -{ - int total_count = batch_size * (intermediate_size / 4); - int threads = 1024; // intermediate_size / iterations / 4; - dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / 1024 + 1)); // (batch_size); - - hipLaunchKernelGGL(( fused_bias_gelu), dim3(grid_dims), dim3(block_dims), 0, stream, - input, bias, total_count, intermediate_size / 4); -} - -template void launch_bias_gelu(float*, const float*, int, int, hipStream_t); -template void launch_bias_gelu<__half>(__half*, const __half*, int, int, hipStream_t); - -__global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size) -{ - float4* input_cast = reinterpret_cast(input); - const float4* bias_cast = reinterpret_cast(bias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 bias_data = bias_cast[offset % hidden_size]; - - data.x += 
bias_data.x; - data.y += bias_data.y; - data.z += bias_data.z; - data.w += bias_data.w; - - input_cast[offset] = data; - } -} - -__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - const float2* bias_cast = reinterpret_cast(bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 bias_vec = bias_cast[offset % hidden_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - low_data.x += low_bias.x; - low_data.y += low_bias.y; - high_data.x += high_bias.x; - high_data.y += high_bias.y; - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - input_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream) -{ - int total_count = batch_size * (hidden_size / 4); - int threads = 1024; // hidden_size / iterations / 4; - dim3 block_dims(threads); - dim3 grid_dims(((total_count - 1) / threads + 1)); // (batch_size); - - hipLaunchKernelGGL(( fused_bias_add), dim3(grid_dims), dim3(block_dims), 0, stream, input, bias, total_count, hidden_size / 4); -} - -template void launch_bias_add(float*, const float*, int, int, hipStream_t); -template void launch_bias_add<__half>(__half*, const __half*, int, int, hipStream_t); - -__global__ void fused_bias_residual(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - int mp_size) -{ - float4* input_cast = reinterpret_cast(input); - 
float4* output_cast = reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x); - data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y); - data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z); - data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; - } -} - -__global__ void fused_bias_residual(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - int mp_size) -{ -#ifdef HALF_PRECISION_AVAILABLE - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); 
- - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - (low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - (low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - (high_data.x + high_res.x) * mp_size + (high_out.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - (high_data.y + high_res.y) * mp_size + (high_out.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_bias_residual(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int hidden_dim, - int mp_size, - hipStream_t stream) -{ - int total_count = batch * hidden_dim / 4; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - - hipLaunchKernelGGL(( fused_bias_residual), dim3(grid_dims), dim3(block_dims), 0, stream, - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); -} - -template void -launch_bias_residual(float*, float*, float*, float*, float*, int, int, int, hipStream_t); -template void launch_bias_residual<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - hipStream_t); - -__global__ void gptj_residual_add(float* input, - float* output, - float* attn, - float* bias, - float* attnbias, - int total_count, - int intermediate_size, - float mp_size) -{ - 
float4* input_cast = reinterpret_cast(input); - float4* output_cast = reinterpret_cast(output); - float4* attn_cast = reinterpret_cast(attn); - float4* bias_cast = reinterpret_cast(bias); - float4* attnbias_cast = reinterpret_cast(attnbias); - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float4 data = input_cast[offset]; - float4 out = output_cast[offset]; - float4 res_vec = attn_cast[offset]; - float4 bias_data = bias_cast[offset % intermediate_size]; - float4 attn_bias = attnbias_cast[offset % intermediate_size]; - - data.x = data.x * mp_size + (out.x + res_vec.x + bias_data.x + attn_bias.x); - data.y = data.y * mp_size + (out.y + res_vec.y + bias_data.y + attn_bias.y); - data.z = data.z * mp_size + (out.z + res_vec.z + bias_data.z + attn_bias.z); - data.w = data.w * mp_size + (out.w + res_vec.w + bias_data.w + attn_bias.w); - - output_cast[offset] = data; - } -} - -__global__ void gptj_residual_add(__half* input, - __half* output, - __half* attn, - __half* bias, - __half* attn_bias, - int total_count, - int intermediate_size, - float mp_size) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - float2* input_cast = reinterpret_cast(input); - float2* output_cast = reinterpret_cast(output); - float2* attn_cast = reinterpret_cast(attn); - - float2* bias_cast = reinterpret_cast(bias); - float2* attnbias_cast = reinterpret_cast(attn_bias); - - int offset = blockIdx.x * blockDim.x + threadIdx.x; - - if (offset < total_count) { - float2 vals_vec = input_cast[offset]; - float2 out_vec = output_cast[offset]; - float2 res_vec = attn_cast[offset]; - - float2 bias_vec = bias_cast[offset % intermediate_size]; - float2 attn_bias_vec = attnbias_cast[offset % intermediate_size]; - - __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec); - __half2* out_half = reinterpret_cast<__half2*>(&out_vec); - __half2* res_half = reinterpret_cast<__half2*>(&res_vec); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec); - 
__half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec); - - float2 low_data = __half22float2(vals_half[0]); - float2 high_data = __half22float2(vals_half[1]); - - float2 low_out = __half22float2(out_half[0]); - float2 high_out = __half22float2(out_half[1]); - - float2 low_res = __half22float2(res_half[0]); - float2 high_res = __half22float2(res_half[1]); - - float2 low_bias = __half22float2(bias_half[0]); - float2 high_bias = __half22float2(bias_half[1]); - - float2 attn_low_bias = __half22float2(attnbias_half[0]); - float2 attn_high_bias = __half22float2(attnbias_half[1]); - - low_data.x = - low_data.x * mp_size + (low_out.x + low_res.x + (low_bias.x + attn_low_bias.x)); - low_data.y = - low_data.y * mp_size + (low_out.y + low_res.y + (low_bias.y + attn_low_bias.y)); - high_data.x = - high_data.x * mp_size + (high_out.x + high_res.x + (high_bias.x + attn_high_bias.x)); - high_data.y = - high_data.y * mp_size + (high_out.y + high_res.y + (high_bias.y + attn_high_bias.y)); - - vals_half[0] = __float22half2_rn(low_data); - vals_half[1] = __float22half2_rn(high_data); - - output_cast[offset] = vals_vec; - } -#endif -} - -template -void launch_gptj_residual_add(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int hidden_dim, - int batch, - int mp_size, - hipStream_t stream) -{ - int total_count = batch * hidden_dim / 4; - dim3 block_dims(1024); - dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size); - - hipLaunchKernelGGL(( gptj_residual_add), dim3(grid_dims), dim3(block_dims), 0, stream, - input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size); -} - -template void launch_gptj_residual_add(float*, - float*, - float*, - float*, - float*, - int, - int, - int, - hipStream_t); -template void launch_gptj_residual_add<__half>(__half*, - __half*, - __half*, - __half*, - __half*, - int, - int, - int, - hipStream_t); - -__global__ void moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - 
int hidden_dim) -{ - unsigned tid = threadIdx.x; - float4* residual_cast = reinterpret_cast(residual); - float4* coef_cast = reinterpret_cast(coef); - float4* mlp_out_cast = reinterpret_cast(mlp_out); - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - float4* coef_cast2 = coef_cast + hidden_dim; - - while (tid < hidden_dim) { - float4 res = residual_cast[tid]; - float4 mlp = mlp_out_cast[tid]; - float4 coef1 = coef_cast[tid]; - float4 coef2 = coef_cast2[tid]; - mlp.x = mlp.x * coef2.x + res.x * coef1.x; - mlp.y = mlp.y * coef2.y + res.y * coef1.y; - mlp.z = mlp.z * coef2.z + res.z * coef1.z; - mlp.w = mlp.w * coef2.w + res.w * coef1.w; - mlp_out_cast[tid] = mlp; - tid += blockDim.x; - } -} - -__global__ void moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim) -{ - unsigned tid = threadIdx.x; - - float2* residual_cast = reinterpret_cast(residual); - float2* mlp_out_cast = reinterpret_cast(mlp_out); - float2* coef_cast = reinterpret_cast(coef); - float2* coef_cast2 = coef_cast + hidden_dim; - - residual_cast += blockIdx.x * hidden_dim; - mlp_out_cast += blockIdx.x * hidden_dim; - - while (tid < hidden_dim) { - float2 res = residual_cast[tid]; - float2 coef1 = coef_cast[tid]; - float2 coef2 = coef_cast[tid]; - float2 data = mlp_out_cast[tid]; - __half* data_h = reinterpret_cast<__half*>(&data); - __half* coef1_h = reinterpret_cast<__half*>(&coef1); - __half* coef2_h = reinterpret_cast<__half*>(&coef2); - __half* res_h = reinterpret_cast<__half*>(&res); - data_h[0] = res_h[0] * coef1_h[0] + data_h[0] * coef2_h[0]; - data_h[1] = res_h[1] * coef1_h[1] + data_h[1] * coef2_h[1]; - data_h[2] = res_h[2] * coef1_h[2] + data_h[2] * coef2_h[2]; - data_h[3] = res_h[3] * coef1_h[3] + data_h[3] * coef2_h[3]; - - mlp_out_cast[tid] = data; - tid += blockDim.x; - } -} - -template -void launch_moe_res_matmul(T* residual, - T* coef, - T* mlp_out, - int seq_len, - int hidden_dim, - hipStream_t 
stream) -{ - dim3 grid_dim(seq_len); - dim3 block_dim(1024); - hipLaunchKernelGGL(( moe_res_matmul), dim3(grid_dim), dim3(block_dim), 0, stream, - residual, coef, mlp_out, seq_len, hidden_dim / 4); -} - -template void launch_moe_res_matmul(float* residual, - float* coef, - float* mlp_out, - int seq_len, - int hidden_dim, - hipStream_t stream); -template void launch_moe_res_matmul(__half* residual, - __half* coef, - __half* mlp_out, - int seq_len, - int hidden_dim, - hipStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/normalize.cu b/deepspeed/ops/csrc/transformer_bak/inference/csrc/normalize.cu deleted file mode 100644 index 1d691394ed5e9cb1568b10b542bbe052566d3ee8..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/normalize.cu +++ /dev/null @@ -1,451 +0,0 @@ -#include -#include "custom_cuda_layers.h" - -//#include -#include -#include -#include - -#define NORM_REG (MAX_REGISTERS) - -namespace cg = cooperative_groups; - -__global__ void fused_bias_residual_layer_norm(float* output, - const float* vals, - const float* gamma, - const float* beta, - float epsilon, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - float sum = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += 
g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id]; - output[out_id + row * row_stride] = inp_reg[f]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* output, - const __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - const __half2* vals_cast = reinterpret_cast(vals); - __half2* out_cast = reinterpret_cast<__half2*>(output); - - int k = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k++] = vals_cast[input_id + row * row_stride]; - input_id += iteration_stride; - } - float sum = 0; - for (int f = k - 1; f >= 0; f--) { - float2 inp_f = __half22float2(inp_reg[f]); - sum += inp_f.x + inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += 
g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = __half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; - inp_reg[f] = __float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - out_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream); - -template <> -void launch_layer_norm(float* out, - float* vals, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - out, vals, gamma, beta, epsilon, hidden_dim); -} - -template <> -void launch_layer_norm<__half>(__half* out, - __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - out, vals, gamma, beta, epsilon, hidden_dim / 2); -} 
- -__global__ void fused_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int row_stride, - bool preLN, - bool mlp_after_attn) -{ - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - float res_f = (residual[input_id + row * row_stride]); - float bias_f = (bias[input_id]); - if (mlp_after_attn) inp_reg[k] += res_f + bias_f; - // if (preLN) res_add[input_id + row * row_stride] = inp_reg[k]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id]; - norm[out_id + row * row_stride] = inp_reg[f]; - } -} - 
-__global__ void fused_residual_layer_norm(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride, - bool preLN, - bool mlp_after_attn) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - __half2* norm_cast = reinterpret_cast<__half2*>(norm); - __half2* res_add_cast = reinterpret_cast<__half2*>(res_add); - __half2* residual_cast = reinterpret_cast<__half2*>(residual); - const __half2* bias_cast = reinterpret_cast(bias); - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals_cast[input_id + row * row_stride]; - float2 inp_f = __half22float2(inp_reg[k]); - float2 res_f = __half22float2(residual_cast[input_id + row * row_stride]); - float2 bias_f = __half22float2(bias_cast[input_id]); - if (mlp_after_attn) { - inp_f.x += res_f.x + bias_f.x; - inp_f.y += res_f.y + bias_f.y; - } - inp_reg[k] = __float22half2_rn(inp_f); - // if (preLN) res_add_cast[input_id + row * row_stride] = __float22half2_rn(res_f); - // //inp_reg[k]; - sum += inp_f.x + inp_f.y; - input_id += iteration_stride; - k++; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = __half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; 
- inp_reg[f] = __float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - norm_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream); - -template <> -void launch_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - fused_residual_layer_norm<<>>(norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim, - preLN, - mlp_after_attn); -} - -template <> -void launch_residual_layer_norm<__half>(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream) -{ - constexpr int threads = 
1024; - - dim3 grid_dim(batch_size); - dim3 block_dim(threads); - - fused_residual_layer_norm<<>>(norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim / 2, - preLN, - mlp_after_attn); -} diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/normalize.hip b/deepspeed/ops/csrc/transformer_bak/inference/csrc/normalize.hip deleted file mode 100644 index dc7fa7accbebfb05a9532cf18121e1a8cc4fc052..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/normalize.hip +++ /dev/null @@ -1,453 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include -#include "custom_hip_layers.h" - -//#include -#include -#include -#include - -#define NORM_REG (MAX_REGISTERS) - -namespace cg = cooperative_groups; - -__global__ void fused_bias_residual_layer_norm(float* output, - const float* vals, - const float* gamma, - const float* beta, - float epsilon, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - float sum = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * 
inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id]; - output[out_id + row * row_stride] = inp_reg[f]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* output, - const __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - const __half2* vals_cast = reinterpret_cast(vals); - __half2* out_cast = reinterpret_cast<__half2*>(output); - - int k = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k++] = vals_cast[input_id + row * row_stride]; - input_id += iteration_stride; - } - float sum = 0; - for (int f = k - 1; f >= 0; f--) { - float2 inp_f = __half22float2(inp_reg[f]); - sum += inp_f.x + inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = 
__half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; - inp_reg[f] = __float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - out_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream); - -template <> -void launch_layer_norm(float* out, - float* vals, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - out, vals, gamma, beta, epsilon, hidden_dim); -} - -template <> -void launch_layer_norm<__half>(__half* out, - __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - out, vals, gamma, beta, epsilon, hidden_dim / 2); -} - -__global__ void 
fused_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int row_stride, - bool preLN, - bool mlp_after_attn) -{ - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - float res_f = (residual[input_id + row * row_stride]); - float bias_f = (bias[input_id]); - if (mlp_after_attn) inp_reg[k] += res_f + bias_f; - // if (preLN) res_add[input_id + row * row_stride] = inp_reg[k]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id]; - norm[out_id + row * row_stride] = inp_reg[f]; - } -} - -__global__ void 
fused_residual_layer_norm(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride, - bool preLN, - bool mlp_after_attn) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - __half2* norm_cast = reinterpret_cast<__half2*>(norm); - __half2* res_add_cast = reinterpret_cast<__half2*>(res_add); - __half2* residual_cast = reinterpret_cast<__half2*>(residual); - const __half2* bias_cast = reinterpret_cast(bias); - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals_cast[input_id + row * row_stride]; - float2 inp_f = __half22float2(inp_reg[k]); - float2 res_f = __half22float2(residual_cast[input_id + row * row_stride]); - float2 bias_f = __half22float2(bias_cast[input_id]); - if (mlp_after_attn) { - inp_f.x += res_f.x + bias_f.x; - inp_f.y += res_f.y + bias_f.y; - } - inp_reg[k] = __float22half2_rn(inp_f); - // if (preLN) res_add_cast[input_id + row * row_stride] = __float22half2_rn(res_f); - // //inp_reg[k]; - sum += inp_f.x + inp_f.y; - input_id += iteration_stride; - k++; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = __half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; - inp_reg[f] = 
__float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - norm_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - hipStream_t stream); - -template <> -void launch_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - hipStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim, - preLN, - mlp_after_attn); -} - -template <> -void launch_residual_layer_norm<__half>(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - 
hipStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim / 2, - preLN, - mlp_after_attn); -} diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/pt_binding.cpp b/deepspeed/ops/csrc/transformer_bak/inference/csrc/pt_binding.cpp deleted file mode 100644 index 5432314bb6dd09110963e8f75232e1b9d259e1b3..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/pt_binding.cpp +++ /dev/null @@ -1,911 +0,0 @@ - -#include -#include -#include -#include "context.h" -#include "cublas_wrappers.h" -#include "custom_cuda_layers.h" - -std::array gemm_algos = std::array({99, 99, 99}); - -#define MAX_OUT_TOKES 10 - -template -at::Tensor ds_softmax(at::Tensor& attn_scores, - at::Tensor& attn_mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - bool async_op) -{ - auto attn_scores_c = attn_scores.contiguous(); - int bsz = attn_scores_c.size(0); - - int seq_len = attn_scores_c.size(1); - int len = attn_scores_c.sizes().size(); - if (len > 3) seq_len = attn_scores_c.size(2); - - int soft_len = attn_scores_c.size(2); - if (len > 3) soft_len = attn_scores_c.size(3); - - int heads = 1; - if (len > 3) heads = attn_scores_c.size(1); - - launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), - (attn_mask.sizes().size() > 1 ? 
(T*)attn_mask.data_ptr() : nullptr), - triangular, - recompute, - local_attention, - window_size, - bsz, - heads, - seq_len, - soft_len, - 1.0, - Context::Instance().GetCurrentStream(async_op)); - - return attn_scores_c; -} - -template -void allocate_workspace(size_t hidden_dim, - size_t max_seq_len, - size_t batch_size, - size_t head_size = 128) -{ - size_t _workSpaceSize = (hidden_dim * batch_size * max_seq_len); - Context::Instance().GenWorkSpace(_workSpaceSize * sizeof(T)); -} - -template -at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W) -{ - auto options = at::TensorOptions() - .dtype(Q.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - T* workspace = (T*)Context::Instance().GetWorkSpace(); - float alpha = 1; - float gemm_beta = 0.0; - - if (!workspace) { - allocate_workspace(W.size(1), MAX_OUT_TOKES, Q.size(0)); - workspace = (T*)Context::Instance().GetWorkSpace(); - } - - auto O = at::from_blob(workspace, {Q.size(1), Q.size(2), W.size(1)}, options); - unsigned m = W.size(1); - unsigned n = Q.size(1) * Q.size(2); - unsigned k = Q.size(0); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_T, - m, - n, - k, - &alpha, - &gemm_beta, - (T*)W.data_ptr(), - (T*)Q.data_ptr(), - (T*)O.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - return O; -} - -template -void attention_unfused(at::Tensor& prev_key_cont, - at::Tensor& query_cont, - at::Tensor& attn_mask, - at::Tensor& prev_value_cont, - at::Tensor& output, - int& bsz, - int& seq_len, - int& soft_len, - int& heads, - float& norm_factor, - bool triangular, - bool recompute, - bool local_attention, - int window_size) -{ - auto options = at::TensorOptions() - .dtype(query_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - float alpha = norm_factor; - float gemm_beta = 0.0; - auto attn_score = at::empty({bsz, heads, seq_len, soft_len}, options); - int k = prev_value_cont.size(2) / heads; - 
cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), - soft_len, - seq_len, - k, - &alpha, - &gemm_beta, - (T*)prev_key_cont.data_ptr(), - (T*)query_cont.data_ptr(), - (T*)attn_score.data_ptr(), - CUBLAS_OP_N, - CUBLAS_OP_N, - soft_len * k, - seq_len * k, - seq_len * soft_len, - bsz * heads, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - attn_score = ds_softmax( - attn_score, attn_mask, triangular, recompute, local_attention, window_size, false); - alpha = 1.0; - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), - k, - seq_len, - soft_len, - &alpha, - &gemm_beta, - (T*)prev_value_cont.data_ptr(), - (T*)attn_score.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_OP_N, - CUBLAS_OP_N, - soft_len * k, - seq_len * soft_len, - seq_len * k, - bsz * heads, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -} - -template -std::vector ds_softmax_context(at::Tensor& query, - at::Tensor& prev_key, - at::Tensor& new_key, - at::Tensor& attn_mask, - at::Tensor& prev_value, - at::Tensor& new_value, - int heads, - float norm_factor, - bool merging, - bool triangular, - bool local_attention, - int window_size, - bool no_masking) -{ - auto query_cont = query.contiguous(); - auto prev_key_cont = prev_key.contiguous(); - auto prev_value_cont = prev_value.contiguous(); - - int new_size = (new_value.sizes().size() > 1 ? new_value.size(1) : 0); - - // Attn_Score [ batch Head Sequence-length Softmax-length] - - int bsz = query_cont.size(0); - int seq_len = query_cont.size(1); - int soft_len = prev_value.size(1); - - auto options = at::TensorOptions() - .dtype(query_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = - at::empty({prev_value.size(0), heads, seq_len, prev_value.size(2) / heads}, options); - attention_unfused(prev_key_cont, - query_cont, - attn_mask, //(no_masking ? 
nullptr : (T*)attn_mask.data_ptr()), - prev_value_cont, - output, - bsz, - seq_len, - soft_len, - heads, - norm_factor, - (triangular && (new_size == 0)), - (new_size == 0), - local_attention, - window_size); - - return {output, prev_key, prev_value}; -} - -template -at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - - int bsz = input_cont.size(0) * input_cont.size(1); - int intermediate_size = input_cont.size(2); - - launch_bias_gelu((T*)input_cont.data_ptr(), - (T*)bias.data_ptr(), - intermediate_size, - bsz, - Context::Instance().GetCurrentStream()); - return input_cont; -} - -template -at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - auto residual_cont = residual.contiguous(); - - int bsz = input_cont.size(0) * input_cont.size(1); - // launch_bias_residual((T*)input_cont.data_ptr(), - // (T*)residual_cont.data_ptr(), - // (T*)bias.data_ptr(), - // bsz, - // input_cont.size(2), - // (bias.size(0) > 1), - // Context::Instance().GetCurrentStream()); - return input_cont; -} - -template -at::Tensor ds_layernorm(at::Tensor& input_cont, at::Tensor& gamma, at::Tensor& betta, float epsilon) -{ - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - launch_layer_norm((T*)inp_norm.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)gamma.data_ptr(), - (T*)betta.data_ptr(), - epsilon, - bsz, - input_cont.size(2), - Context::Instance().GetCurrentStream()); - return inp_norm; -} - -template -at::Tensor qkv_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) -{ - auto inp_norm = ds_layernorm(input, gamma, beta, epsilon); - - // cudaEventRecord(Context::Instance().GetCompEvent(1), Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - 
int bsz = input.size(0) * input.size(1); - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - if (add_bias) - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - return inp_norm; -} - -template -std::vector ds_qkv_gemm(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = - qkv_unfused_cublas(output, input_cont, weight, bias, gamma, beta, epsilon, add_bias); - - return {output, inp_norm}; -} - -template -void quantized_gemm(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& qscale, - int groups, - int merge_count) -{ - int bsz = input.size(0) * input.size(1); - auto options = at::TensorOptions() - .dtype(input.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - auto weight16 = at::empty({weight.size(0), weight.size(1)}, options); - - launch_dequantize((T*)weight16.data_ptr(), - (int8_t*)weight.data_ptr(), - (float*)qscale.data_ptr(), - weight.size(1), - weight.size(0), - groups, - merge_count, - Context::Instance().GetCurrentStream()); - - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = 
(T)0.0; - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight16.data_ptr(), - (T*)input.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -} - -template -at::Tensor ds_qkv_gemm_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool add_bias) -{ - int bsz = input.size(0) * input.size(1); - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - auto inp_norm = ds_layernorm(input_cont, gamma, beta, epsilon); - - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); - if (add_bias) - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -template -at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - - 
launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -template -at::Tensor ds_linear_layer_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& q_scale, - int groups) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - int bsz = input_cont.size(0) * input_cont.size(1); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - quantized_gemm(output, input_cont, weight, q_scale, groups, 0); - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - return output; -} - -template -at::Tensor ds_vector_matmul(at::Tensor& input, at::Tensor& weight, bool async_op) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream(async_op)); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - return output; -} - -template -at::Tensor ds_vector_matmul_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& q_scale, - int groups, - int merge_count) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - 
.layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - quantized_gemm(output, input_cont, weight, q_scale, groups, merge_count); - return output; -} - -template -void mlp_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) -{ - int bsz = input.size(0) * input.size(1); - auto inp_norm = at::empty_like(input); - - launch_residual_layer_norm((T*)inp_norm.data_ptr(), - (T*)nullptr, - (T*)input.data_ptr(), - (T*)residual.data_ptr(), - (T*)input_bias.data_ptr(), - (T*)gamma.data_ptr(), - (T*)beta.data_ptr(), - epsilon, - bsz, - input.size(2), - preLayerNorm, - mlp_after_attn, - Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); -} -template -at::Tensor ds_mlp_gemm(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), 
weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - mlp_unfused_cublas(output, - mlp_after_attn ? input : residual, - residual, - input_bias, - weight, - bias, - gamma, - beta, - epsilon, - preLayerNorm, - mlp_after_attn); - - return output; -} - -template -std::vector ds_mlp_gemm_int8(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool preLayerNorm) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - - auto residual_add = (preLayerNorm ? at::empty_like(input_cont) : inp_norm); - // computing the blocking across K dimension - // launch_residual_layer_norm((T*)inp_norm.data_ptr(), - // (T*)residual_add.data_ptr(), - // (T*)input_cont.data_ptr(), - // (T*)residual.data_ptr(), - // (T*)input_bias.data_ptr(), - // (T*)gamma.data_ptr(), - // (T*)beta.data_ptr(), - // epsilon, - // bsz, - // input_cont.size(2), - // preLayerNorm, - // Context::Instance().GetCurrentStream()); - - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return {output, residual_add}; -} - -template -at::Tensor fused_gemm_gelu(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& weight_out, - const float epsilon, - bool preLayerNorm, - bool async_op) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - 
.device(at::kCUDA) - .requires_grad(false); - - auto intermediate = - at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight_out.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)intermediate.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - launch_bias_gelu((T*)intermediate.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - weight_out.size(1), - bsz, - intermediate.size(2), - &alpha, - &gemm_beta, - (T*)weight_out.data_ptr(), - (T*)intermediate.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - // cudaEventRecord(Context::Instance().GetCompEvent(2), - // Context::Instance().GetCurrentStream(true)); - return output; -} - -void residual_add_bias(at::Tensor& output, - at::Tensor& input, - at::Tensor& attention_output, - at::Tensor& output_b, - at::Tensor& attention_b, - int mp_size, - bool mlp_after_attn) -{ - int bsz = input.size(0) * input.size(1); - int hidden_size = input.size(2); - // cudaStreamWaitEvent( - // Context::Instance().GetCurrentStream(), Context::Instance().GetCompEvent(2), 0); - if (input.scalar_type() == at::kFloat) - if (mlp_after_attn) - launch_bias_residual((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - 
launch_gptj_residual_add((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); - else if (mlp_after_attn) - launch_bias_residual((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - launch_gptj_residual_add<__half>((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); -} - -std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, - at::Tensor& key_layer, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - bool rotate_half, - bool rotate_every_two) -{ - auto query_cont = mixed_query.contiguous(); - auto key_cont = key_layer.contiguous(); - - unsigned bsz = mixed_query.size(0); - unsigned head_size = mixed_query.size(2) / num_heads; - unsigned seq_len = mixed_query.size(1); - - if (mixed_query.scalar_type() == at::kFloat) - launch_apply_rotary_pos_emb((float*)query_cont.data_ptr(), - (float*)key_cont.data_ptr(), - head_size, - seq_len, - rotary_dim, - offset, - num_heads, - bsz, - rotate_half, - rotate_every_two, - Context::Instance().GetCurrentStream()); - else - launch_apply_rotary_pos_emb<__half>((__half*)query_cont.data_ptr(), - (__half*)key_cont.data_ptr(), - head_size, - seq_len, - rotary_dim, - offset, - num_heads, - bsz, - rotate_half, - rotate_every_two, - Context::Instance().GetCurrentStream()); - return {query_cont, key_cont}; -} - -template -at::Tensor fused_gemm_gelu_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - const float epsilon, - at::Tensor& q_scale, - int 
groups, - bool preLayerNorm) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - int bsz = input_cont.size(0) * input_cont.size(1); - - quantized_gemm(output, input_cont, weight, q_scale, groups, 0); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -at::Tensor moe_res_matmul(at::Tensor& moe_res, at::Tensor& coef, at::Tensor& output) -{ - int M = moe_res.size(0) * moe_res.size(1); - int N = moe_res.size(2); - Context::Instance().SynchComm(); - if (moe_res.scalar_type() == at::kFloat) { - launch_moe_res_matmul((float*)moe_res.data_ptr(), - (float*)coef.data_ptr(), - (float*)output.data_ptr(), - M, - N, - at::cuda::getCurrentCUDAStream()); - } else { - launch_moe_res_matmul<__half>((__half*)moe_res.data_ptr(), - (__half*)coef.data_ptr(), - (__half*)output.data_ptr(), - M, - N, - at::cuda::getCurrentCUDAStream()); - } - return output; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("softmax_fp32", &ds_softmax, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def("softmax_fp16", &ds_softmax<__half>, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def( - "softmax_context_fp32", &ds_softmax_context, "DeepSpeed attention with fp32 (CUDA)"); - m.def("softmax_context_fp16", - &ds_softmax_context<__half>, - "DeepSpeed attention with fp32 (CUDA)"); - m.def("bias_gelu_fp32", &ds_bias_gelu, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_residual_fp32", - &ds_bias_residual, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("bias_residual_fp16", - &ds_bias_residual<__half>, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - 
m.def("layer_norm_fp32", &ds_layernorm, "DeepSpeed layer-norm with fp32 (CUDA)"); - m.def("layer_norm_fp16", &ds_layernorm<__half>, "DeepSpeed layer-norm with fp16 (CUDA)"); - m.def("qkv_gemm_fp32", &ds_qkv_gemm, "DeepSpeed qkv gemm with fp32 (CUDA)"); - m.def("qkv_gemm_fp16", &ds_qkv_gemm<__half>, "DeepSpeed qkv gemm with fp16 (CUDA)"); - m.def("qkv_gemm_int8", &ds_qkv_gemm_int8<__half>, "DeepSpeed qkv gemm with int8 (CUDA)"); - m.def("mlp_gemm_fp32", &ds_mlp_gemm, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("mlp_gemm_fp16", &ds_mlp_gemm<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("mlp_gemm_int8", &ds_mlp_gemm_int8<__half>, "DeepSpeed mlp with int8 (CUDA)"); - m.def("vector_matmul_fp32", &ds_vector_matmul, "DeepSpeed vector-MM with fp32 (CUDA)"); - m.def("vector_matmul_fp16", &ds_vector_matmul<__half>, "DeepSpeed vector-MM with fp16 (CUDA)"); - m.def("vector_matmul_int8", - &ds_vector_matmul_int8<__half>, - "DeepSpeed vector-MM with int8 (CUDA)"); - m.def("linear_layer_fp32", &ds_linear_layer, "DeepSpeed linear_layer with fp32 (CUDA)"); - m.def("linear_layer_fp16", &ds_linear_layer<__half>, "DeepSpeed linear_layer with fp16 (CUDA)"); - m.def("linear_layer_int8", - &ds_linear_layer_int8<__half>, - "DeepSpeed linear_layer with int8 (CUDA)"); - m.def("fused_gemm_gelu_fp32", &fused_gemm_gelu, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("fused_gemm_gelu_fp16", &fused_gemm_gelu<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("residual_add", &residual_add_bias, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("apply_rotary_pos_emb", &apply_rotary_pos_emb, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("einsum_sec_sm_ecm_fp32", - &einsum_sec_sm_ecm, - "DeepSpeed vector-MM with fp32 (CUDA)"); - - m.def("einsum_sec_sm_ecm_fp16", - &einsum_sec_sm_ecm<__half>, - "DeepSpeed vector-MM with fp16 (CUDA)"); - m.def("moe_res_matmul", &moe_res_matmul, "DeepSpeed moe residual matmul (CUDA)"); -} diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/pt_binding_hip.cpp 
b/deepspeed/ops/csrc/transformer_bak/inference/csrc/pt_binding_hip.cpp deleted file mode 100644 index 009951db340bcdaa2e3c0d806782dbca89dfdf76..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/pt_binding_hip.cpp +++ /dev/null @@ -1,912 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! - -#include -#include -#include -#include "context_hip.h" -#include "cublas_wrappers_hip.h" -#include "custom_hip_layers.h" - -std::array gemm_algos = std::array({99, 99, 99}); - -#define MAX_OUT_TOKES 10 - -template -at::Tensor ds_softmax(at::Tensor& attn_scores, - at::Tensor& attn_mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - bool async_op) -{ - auto attn_scores_c = attn_scores.contiguous(); - int bsz = attn_scores_c.size(0); - - int seq_len = attn_scores_c.size(1); - int len = attn_scores_c.sizes().size(); - if (len > 3) seq_len = attn_scores_c.size(2); - - int soft_len = attn_scores_c.size(2); - if (len > 3) soft_len = attn_scores_c.size(3); - - int heads = 1; - if (len > 3) heads = attn_scores_c.size(1); - - launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), - (attn_mask.sizes().size() > 1 ? 
(T*)attn_mask.data_ptr() : nullptr), - triangular, - recompute, - local_attention, - window_size, - bsz, - heads, - seq_len, - soft_len, - 1.0, - Context::Instance().GetCurrentStream(async_op)); - - return attn_scores_c; -} - -template -void allocate_workspace(size_t hidden_dim, - size_t max_seq_len, - size_t batch_size, - size_t head_size = 128) -{ - size_t _workSpaceSize = (hidden_dim * batch_size * max_seq_len); - Context::Instance().GenWorkSpace(_workSpaceSize * sizeof(T)); -} - -template -at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W) -{ - auto options = at::TensorOptions() - .dtype(Q.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - T* workspace = (T*)Context::Instance().GetWorkSpace(); - float alpha = 1; - float gemm_beta = 0.0; - - if (!workspace) { - allocate_workspace(W.size(1), MAX_OUT_TOKES, Q.size(0)); - workspace = (T*)Context::Instance().GetWorkSpace(); - } - - auto O = at::from_blob(workspace, {Q.size(1), Q.size(2), W.size(1)}, options); - unsigned m = W.size(1); - unsigned n = Q.size(1) * Q.size(2); - unsigned k = Q.size(0); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_transpose, - m, - n, - k, - &alpha, - &gemm_beta, - (T*)W.data_ptr(), - (T*)Q.data_ptr(), - (T*)O.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - return O; -} - -template -void attention_unfused(at::Tensor& prev_key_cont, - at::Tensor& query_cont, - at::Tensor& attn_mask, - at::Tensor& prev_value_cont, - at::Tensor& output, - int& bsz, - int& seq_len, - int& soft_len, - int& heads, - float& norm_factor, - bool triangular, - bool recompute, - bool local_attention, - int window_size) -{ - auto options = at::TensorOptions() - .dtype(query_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - float alpha = norm_factor; - float gemm_beta = 0.0; - auto attn_score = at::empty({bsz, heads, seq_len, soft_len}, options); - int k = 
prev_value_cont.size(2) / heads; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), - soft_len, - seq_len, - k, - &alpha, - &gemm_beta, - (T*)prev_key_cont.data_ptr(), - (T*)query_cont.data_ptr(), - (T*)attn_score.data_ptr(), - rocblas_operation_none, - rocblas_operation_none, - soft_len * k, - seq_len * k, - seq_len * soft_len, - bsz * heads, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - attn_score = ds_softmax( - attn_score, attn_mask, triangular, recompute, local_attention, window_size, false); - alpha = 1.0; - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), - k, - seq_len, - soft_len, - &alpha, - &gemm_beta, - (T*)prev_value_cont.data_ptr(), - (T*)attn_score.data_ptr(), - (T*)output.data_ptr(), - rocblas_operation_none, - rocblas_operation_none, - soft_len * k, - seq_len * soft_len, - seq_len * k, - bsz * heads, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -} - -template -std::vector ds_softmax_context(at::Tensor& query, - at::Tensor& prev_key, - at::Tensor& new_key, - at::Tensor& attn_mask, - at::Tensor& prev_value, - at::Tensor& new_value, - int heads, - float norm_factor, - bool merging, - bool triangular, - bool local_attention, - int window_size, - bool no_masking) -{ - auto query_cont = query.contiguous(); - auto prev_key_cont = prev_key.contiguous(); - auto prev_value_cont = prev_value.contiguous(); - - int new_size = (new_value.sizes().size() > 1 ? 
new_value.size(1) : 0); - - // Attn_Score [ batch Head Sequence-length Softmax-length] - - int bsz = query_cont.size(0); - int seq_len = query_cont.size(1); - int soft_len = prev_value.size(1); - - auto options = at::TensorOptions() - .dtype(query_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = - at::empty({prev_value.size(0), heads, seq_len, prev_value.size(2) / heads}, options); - attention_unfused(prev_key_cont, - query_cont, - attn_mask, //(no_masking ? nullptr : (T*)attn_mask.data_ptr()), - prev_value_cont, - output, - bsz, - seq_len, - soft_len, - heads, - norm_factor, - (triangular && (new_size == 0)), - (new_size == 0), - local_attention, - window_size); - - return {output, prev_key, prev_value}; -} - -template -at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - - int bsz = input_cont.size(0) * input_cont.size(1); - int intermediate_size = input_cont.size(2); - - launch_bias_gelu((T*)input_cont.data_ptr(), - (T*)bias.data_ptr(), - intermediate_size, - bsz, - Context::Instance().GetCurrentStream()); - return input_cont; -} - -template -at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - auto residual_cont = residual.contiguous(); - - int bsz = input_cont.size(0) * input_cont.size(1); - // launch_bias_residual((T*)input_cont.data_ptr(), - // (T*)residual_cont.data_ptr(), - // (T*)bias.data_ptr(), - // bsz, - // input_cont.size(2), - // (bias.size(0) > 1), - // Context::Instance().GetCurrentStream()); - return input_cont; -} - -template -at::Tensor ds_layernorm(at::Tensor& input_cont, at::Tensor& gamma, at::Tensor& betta, float epsilon) -{ - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - launch_layer_norm((T*)inp_norm.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)gamma.data_ptr(), - (T*)betta.data_ptr(), - 
epsilon, - bsz, - input_cont.size(2), - Context::Instance().GetCurrentStream()); - return inp_norm; -} - -template -at::Tensor qkv_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) -{ - auto inp_norm = ds_layernorm(input, gamma, beta, epsilon); - - // hipEventRecord(Context::Instance().GetCompEvent(1), Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - int bsz = input.size(0) * input.size(1); - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - if (add_bias) - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - return inp_norm; -} - -template -std::vector ds_qkv_gemm(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool add_bias) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = - qkv_unfused_cublas(output, input_cont, weight, bias, gamma, beta, epsilon, add_bias); - - return {output, inp_norm}; -} - -template -void quantized_gemm(at::Tensor& output, - at::Tensor& input, - at::Tensor& weight, - at::Tensor& qscale, - int groups, - int merge_count) -{ - int bsz = input.size(0) * input.size(1); - auto options = 
at::TensorOptions() - .dtype(input.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - auto weight16 = at::empty({weight.size(0), weight.size(1)}, options); - - launch_dequantize((T*)weight16.data_ptr(), - (int8_t*)weight.data_ptr(), - (float*)qscale.data_ptr(), - weight.size(1), - weight.size(0), - groups, - merge_count, - Context::Instance().GetCurrentStream()); - - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight16.data_ptr(), - (T*)input.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); -} - -template -at::Tensor ds_qkv_gemm_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool add_bias) -{ - int bsz = input.size(0) * input.size(1); - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - auto inp_norm = ds_layernorm(input_cont, gamma, beta, epsilon); - - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); - if (add_bias) - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -template -at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& weight, at::Tensor& bias) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto 
output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -template -at::Tensor ds_linear_layer_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& q_scale, - int groups) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - int bsz = input_cont.size(0) * input_cont.size(1); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - quantized_gemm(output, input_cont, weight, q_scale, groups, 0); - launch_bias_add((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - return output; -} - -template -at::Tensor ds_vector_matmul(at::Tensor& input, at::Tensor& weight, bool async_op) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), - 
Context::Instance().GetCurrentStream(async_op)); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input_cont.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - return output; -} - -template -at::Tensor ds_vector_matmul_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& q_scale, - int groups, - int merge_count) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - quantized_gemm(output, input_cont, weight, q_scale, groups, merge_count); - return output; -} - -template -void mlp_unfused_cublas(at::Tensor& output, - at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) -{ - int bsz = input.size(0) * input.size(1); - auto inp_norm = at::empty_like(input); - - launch_residual_layer_norm((T*)inp_norm.data_ptr(), - (T*)nullptr, - (T*)input.data_ptr(), - (T*)residual.data_ptr(), - (T*)input_bias.data_ptr(), - (T*)gamma.data_ptr(), - (T*)beta.data_ptr(), - epsilon, - bsz, - input.size(2), - preLayerNorm, - mlp_after_attn, - Context::Instance().GetCurrentStream()); - - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)inp_norm.data_ptr(), - (T*)output.data_ptr(), - 
CUBLAS_GEMM_DEFAULT_TENSOR_OP); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); -} -template -at::Tensor ds_mlp_gemm(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - bool preLayerNorm, - bool mlp_after_attn) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - - mlp_unfused_cublas(output, - mlp_after_attn ? input : residual, - residual, - input_bias, - weight, - bias, - gamma, - beta, - epsilon, - preLayerNorm, - mlp_after_attn); - - return output; -} - -template -std::vector ds_mlp_gemm_int8(at::Tensor& input, - at::Tensor& residual, - at::Tensor& input_bias, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& gamma, - at::Tensor& beta, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool preLayerNorm) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - int bsz = input_cont.size(0) * input_cont.size(1); - auto inp_norm = at::empty_like(input_cont); - - auto residual_add = (preLayerNorm ? 
at::empty_like(input_cont) : inp_norm); - // computing the blocking across K dimension - // launch_residual_layer_norm((T*)inp_norm.data_ptr(), - // (T*)residual_add.data_ptr(), - // (T*)input_cont.data_ptr(), - // (T*)residual.data_ptr(), - // (T*)input_bias.data_ptr(), - // (T*)gamma.data_ptr(), - // (T*)beta.data_ptr(), - // epsilon, - // bsz, - // input_cont.size(2), - // preLayerNorm, - // Context::Instance().GetCurrentStream()); - - quantized_gemm(output, inp_norm, weight, q_scale, groups, 0); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return {output, residual_add}; -} - -template -at::Tensor fused_gemm_gelu(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - at::Tensor& weight_out, - const float epsilon, - bool preLayerNorm, - bool async_op) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto intermediate = - at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight_out.size(1)}, options); - int bsz = input_cont.size(0) * input_cont.size(1); - float alpha = (T)1.0; - float gemm_beta = (T)0.0; - rocblas_set_stream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - rocblas_operation_none, - weight.size(1), - bsz, - input.size(2), - &alpha, - &gemm_beta, - (T*)weight.data_ptr(), - (T*)input_cont.data_ptr(), - (T*)intermediate.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - launch_bias_gelu((T*)intermediate.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - rocblas_operation_none, - 
rocblas_operation_none, - weight_out.size(1), - bsz, - intermediate.size(2), - &alpha, - &gemm_beta, - (T*)weight_out.data_ptr(), - (T*)intermediate.data_ptr(), - (T*)output.data_ptr(), - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - // hipEventRecord(Context::Instance().GetCompEvent(2), - // Context::Instance().GetCurrentStream(true)); - return output; -} - -void residual_add_bias(at::Tensor& output, - at::Tensor& input, - at::Tensor& attention_output, - at::Tensor& output_b, - at::Tensor& attention_b, - int mp_size, - bool mlp_after_attn) -{ - int bsz = input.size(0) * input.size(1); - int hidden_size = input.size(2); - // hipStreamWaitEvent( - // Context::Instance().GetCurrentStream(), Context::Instance().GetCompEvent(2), 0); - if (input.scalar_type() == at::kFloat) - if (mlp_after_attn) - launch_bias_residual((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - launch_gptj_residual_add((float*)input.data_ptr(), - (float*)output.data_ptr(), - (float*)attention_output.data_ptr(), - (float*)output_b.data_ptr(), - (float*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); - else if (mlp_after_attn) - launch_bias_residual((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - bsz, - hidden_size, - mp_size, - Context::Instance().GetCurrentStream()); - else - launch_gptj_residual_add<__half>((__half*)input.data_ptr(), - (__half*)output.data_ptr(), - (__half*)attention_output.data_ptr(), - (__half*)output_b.data_ptr(), - (__half*)attention_b.data_ptr(), - hidden_size, - bsz, - mp_size, - Context::Instance().GetCurrentStream()); -} - -std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, - at::Tensor& key_layer, - unsigned 
rotary_dim, - unsigned offset, - unsigned num_heads, - bool rotate_half, - bool rotate_every_two) -{ - auto query_cont = mixed_query.contiguous(); - auto key_cont = key_layer.contiguous(); - - unsigned bsz = mixed_query.size(0); - unsigned head_size = mixed_query.size(2) / num_heads; - unsigned seq_len = mixed_query.size(1); - - if (mixed_query.scalar_type() == at::kFloat) - launch_apply_rotary_pos_emb((float*)query_cont.data_ptr(), - (float*)key_cont.data_ptr(), - head_size, - seq_len, - rotary_dim, - offset, - num_heads, - bsz, - rotate_half, - rotate_every_two, - Context::Instance().GetCurrentStream()); - else - launch_apply_rotary_pos_emb<__half>((__half*)query_cont.data_ptr(), - (__half*)key_cont.data_ptr(), - head_size, - seq_len, - rotary_dim, - offset, - num_heads, - bsz, - rotate_half, - rotate_every_two, - Context::Instance().GetCurrentStream()); - return {query_cont, key_cont}; -} - -template -at::Tensor fused_gemm_gelu_int8(at::Tensor& input, - at::Tensor& weight, - at::Tensor& bias, - const float epsilon, - at::Tensor& q_scale, - int groups, - bool preLayerNorm) -{ - auto input_cont = input.contiguous(); - auto options = at::TensorOptions() - .dtype(input_cont.options().dtype()) - .layout(at::kStrided) - .device(at::kCUDA) - .requires_grad(false); - - auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options); - - int bsz = input_cont.size(0) * input_cont.size(1); - - quantized_gemm(output, input_cont, weight, q_scale, groups, 0); - launch_bias_gelu((T*)output.data_ptr(), - (T*)bias.data_ptr(), - weight.size(1), - bsz, - Context::Instance().GetCurrentStream()); - - return output; -} - -at::Tensor moe_res_matmul(at::Tensor& moe_res, at::Tensor& coef, at::Tensor& output) -{ - int M = moe_res.size(0) * moe_res.size(1); - int N = moe_res.size(2); - Context::Instance().SynchComm(); - if (moe_res.scalar_type() == at::kFloat) { - launch_moe_res_matmul((float*)moe_res.data_ptr(), - (float*)coef.data_ptr(), - 
(float*)output.data_ptr(), - M, - N, - at::hip::getCurrentHIPStreamMasqueradingAsCUDA()); - } else { - launch_moe_res_matmul<__half>((__half*)moe_res.data_ptr(), - (__half*)coef.data_ptr(), - (__half*)output.data_ptr(), - M, - N, - at::hip::getCurrentHIPStreamMasqueradingAsCUDA()); - } - return output; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("softmax_fp32", &ds_softmax, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def("softmax_fp16", &ds_softmax<__half>, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def( - "softmax_context_fp32", &ds_softmax_context, "DeepSpeed attention with fp32 (CUDA)"); - m.def("softmax_context_fp16", - &ds_softmax_context<__half>, - "DeepSpeed attention with fp32 (CUDA)"); - m.def("bias_gelu_fp32", &ds_bias_gelu, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_residual_fp32", - &ds_bias_residual, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("bias_residual_fp16", - &ds_bias_residual<__half>, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("layer_norm_fp32", &ds_layernorm, "DeepSpeed layer-norm with fp32 (CUDA)"); - m.def("layer_norm_fp16", &ds_layernorm<__half>, "DeepSpeed layer-norm with fp16 (CUDA)"); - m.def("qkv_gemm_fp32", &ds_qkv_gemm, "DeepSpeed qkv gemm with fp32 (CUDA)"); - m.def("qkv_gemm_fp16", &ds_qkv_gemm<__half>, "DeepSpeed qkv gemm with fp16 (CUDA)"); - m.def("qkv_gemm_int8", &ds_qkv_gemm_int8<__half>, "DeepSpeed qkv gemm with int8 (CUDA)"); - m.def("mlp_gemm_fp32", &ds_mlp_gemm, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("mlp_gemm_fp16", &ds_mlp_gemm<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("mlp_gemm_int8", &ds_mlp_gemm_int8<__half>, "DeepSpeed mlp with int8 (CUDA)"); - m.def("vector_matmul_fp32", &ds_vector_matmul, "DeepSpeed vector-MM with fp32 (CUDA)"); - m.def("vector_matmul_fp16", &ds_vector_matmul<__half>, "DeepSpeed vector-MM with fp16 (CUDA)"); - m.def("vector_matmul_int8", - 
&ds_vector_matmul_int8<__half>, - "DeepSpeed vector-MM with int8 (CUDA)"); - m.def("linear_layer_fp32", &ds_linear_layer, "DeepSpeed linear_layer with fp32 (CUDA)"); - m.def("linear_layer_fp16", &ds_linear_layer<__half>, "DeepSpeed linear_layer with fp16 (CUDA)"); - m.def("linear_layer_int8", - &ds_linear_layer_int8<__half>, - "DeepSpeed linear_layer with int8 (CUDA)"); - m.def("fused_gemm_gelu_fp32", &fused_gemm_gelu, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("fused_gemm_gelu_fp16", &fused_gemm_gelu<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("residual_add", &residual_add_bias, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("apply_rotary_pos_emb", &apply_rotary_pos_emb, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("einsum_sec_sm_ecm_fp32", - &einsum_sec_sm_ecm, - "DeepSpeed vector-MM with fp32 (CUDA)"); - - m.def("einsum_sec_sm_ecm_fp16", - &einsum_sec_sm_ecm<__half>, - "DeepSpeed vector-MM with fp16 (CUDA)"); - m.def("moe_res_matmul", &moe_res_matmul, "DeepSpeed moe residual matmul (CUDA)"); -} diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/softmax.cu b/deepspeed/ops/csrc/transformer_bak/inference/csrc/softmax.cu deleted file mode 100644 index 788de78bb1d836d274c4cef4e22541acb3bc2dd4..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/softmax.cu +++ /dev/null @@ -1,432 +0,0 @@ -#include -#include "custom_cuda_layers.h" - -//#include -#include -#include -#include - -#define ATTN_THREADS 1024 -#define MAX_REG_SIZE 8 - -#define minus_infinity -10000.0 - -void CheckCudaErrorAux(const char* file, unsigned line) -{ - cudaError_t err = cudaGetLastError(); - if (err == cudaSuccess) return; - std::cerr << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line - << std::endl; - throw std::runtime_error("CUDA ERROR!!!\n"); -} - -#define CUDA_CHECK_ERROR() CheckCudaErrorAux(__FILE__, __LINE__) - -namespace cg = cooperative_groups; - -__global__ void attn_softmax_v2(__half* vals, - __half* mask, 
- bool triangular, - bool recompute, - bool local_attention, - int window_size, - int total_count, - int heads, - int sequence_length, - int num_seq, - float scale, - int iterations, - int reduceWidth) -{ -#ifdef HALF_PRECISION_AVAILABLE - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - float2 low_data[MAX_REG_SIZE]; - float2 high_data[MAX_REG_SIZE]; - - __half2 h_scale = __float2half2_rn(scale); - - int wid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int reduce_blocks = reduceWidth >> 5; - int seq_lane = threadIdx.x % reduceWidth; - - __shared__ float partialSum[MAX_WARP_NUM]; - - int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); - - if (iter_offset < total_count) { - vals += (iter_offset * sequence_length); - - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); - int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; - - int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); - int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) - ? (real_seq_id >> 2) - (window_size >> 2) - : 0; - int window_stride = - (local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; - low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? __half2float(vals[data_id + 1]) - : minus_infinity; - high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? 
__half2float(vals[data_id + 2]) - : minus_infinity; - high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? __half2float(vals[data_id + 3]) - : minus_infinity; - if (mask && recompute) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); - } - } else { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; - low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) && - (data_id + 1) > window_stride) && - (data_id + 1) < sequence_length) - ? __half2float(vals[data_id + 1]) - : minus_infinity; - high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) && - (data_id + 2) > window_stride) && - (data_id + 2) < sequence_length) - ? __half2float(vals[data_id + 2]) - : minus_infinity; - high_data[i].y = minus_infinity; - if (mask && recompute) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - if ((data_id + 1) < sequence_length) - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - if ((data_id + 2) < sequence_length) - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - } - } - // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); - max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); - max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); - max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); - max_val = (high_data[i].y > max_val ? high_data[i].y : max_val); - } else { - low_data[i].x = minus_infinity; - low_data[i].y = minus_infinity; - high_data[i].x = minus_infinity; - high_data[i].y = minus_infinity; - } - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE); - } - float sum = 0; - for (int i = 0; i < iterations; i++) { - low_data[i].x = __expf(low_data[i].x - max_val); - low_data[i].y = __expf(low_data[i].y - max_val); - high_data[i].x = __expf(high_data[i].x - max_val); - high_data[i].y = __expf(high_data[i].y - max_val); - - sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i); - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / WARP_SIZE); - } - sum += 1e-6; - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - - if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = low_data[i].x / sum; - vals[data_id + 1] = low_data[i].y / sum; - vals[data_id + 2] = high_data[i].x / sum; - vals[data_id + 3] = high_data[i].y / sum; - } else { - vals[data_id] = low_data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = low_data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = high_data[i].x / sum; - } - } - } - } -#endif -} - -__global__ void attn_softmax_v2(float* vals, - float* attn_mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int total_count, - int heads, - int sequence_length, - int num_seq, - float scale, - int iterations, - int reduceWidth) -{ - cg::thread_block b = cg::this_thread_block(); - 
cg::thread_block_tile g = cg::tiled_partition(b); - - float4 data[MAX_REG_SIZE]; - - int wid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int reduce_blocks = reduceWidth >> 5; - int seq_lane = threadIdx.x % reduceWidth; - - __shared__ float partialSum[MAX_WARP_NUM]; - - int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); - if (iter_offset < total_count) { - vals += (iter_offset * sequence_length); - - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); - int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; - - int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); - int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) - ? (real_seq_id >> 2) - (window_size >> 2) - : 0; - int window_stride = - (local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - data[i].x = (data_id > window_stride ? vals[data_id] : minus_infinity); - data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? vals[data_id + 1] - : minus_infinity; - data[i].z = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? vals[data_id + 2] - : minus_infinity; - data[i].w = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? 
vals[data_id + 3] - : minus_infinity; - if (attn_mask && recompute) { - data[i].x += attn_mask[data_id + mask_offset]; - data[i].y += attn_mask[data_id + mask_offset + 1]; - data[i].z += attn_mask[data_id + mask_offset + 2]; - data[i].w += attn_mask[data_id + mask_offset + 3]; - } - } else { - data[i].x = data_id > window_stride ? vals[data_id] : minus_infinity; - data[i].y = (((!triangular || (data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride && (data_id + 1) < sequence_length) - ? (vals[data_id + 1]) - : minus_infinity; - data[i].z = (((!triangular || (data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride && (data_id + 2) < sequence_length) - ? (vals[data_id + 2]) - : minus_infinity; - data[i].w = minus_infinity; - if (attn_mask && recompute) { - data[i].x += attn_mask[data_id + mask_offset]; - if ((data_id + 1) < sequence_length) - data[i].y += attn_mask[data_id + mask_offset + 1]; - if ((data_id + 2) < sequence_length) - data[i].z += attn_mask[data_id + mask_offset + 2]; - } - } - max_val = (data[i].x > max_val ? data[i].x : max_val); - max_val = (data[i].y > max_val ? data[i].y : max_val); - max_val = (data[i].z > max_val ? data[i].z : max_val); - max_val = (data[i].w > max_val ? data[i].w : max_val); - } else { - data[i].x = minus_infinity; - data[i].y = minus_infinity; - data[i].z = minus_infinity; - data[i].w = minus_infinity; - } - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - data[i].x = __expf(data[i].x - max_val); - data[i].y = __expf(data[i].y - max_val); - data[i].z = __expf(data[i].z - max_val); - data[i].w = __expf(data[i].w - max_val); - - sum += (data[i].x + data[i].y + data[i].z + data[i].w); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i); - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / WARP_SIZE); - } - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - - if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = data[i].x / sum; - vals[data_id + 1] = data[i].y / sum; - vals[data_id + 2] = data[i].z / sum; - vals[data_id + 3] = data[i].w / sum; - } else { - vals[data_id] = data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = data[i].z / sum; - } - } - } - } -} - -template -void launch_attn_softmax_v2(T* vals, - T* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - cudaStream_t stream) -{ - int total_count = batch_size * heads * num_seq; - dim3 grid_dim((total_count - 1) / (WARP_SIZE / ((sequence_length - 1) / ATTN_THREADS + 1)) + 1); - dim3 block_dim(ATTN_THREADS); - - const int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1) * WARP_SIZE; - const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1; - - if (sequence_length <= 32768) - attn_softmax_v2<<>>( - vals, - mask, - triangular, - recompute, - 
local_attention, - window_size, - total_count, - (triangular ? (heads * batch_size) : heads), - sequence_length, - num_seq, - scale, - iterations, - reduce_width); - else - throw std::runtime_error("Unsupport Seq_Length!"); -} - -template void launch_attn_softmax_v2(float* vals, - float* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - cudaStream_t stream); -template void launch_attn_softmax_v2(__half* vals, - __half* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/inference/csrc/softmax.hip b/deepspeed/ops/csrc/transformer_bak/inference/csrc/softmax.hip deleted file mode 100644 index a933d5177295f9f483638d19ecc89d3ae5f3937d..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/csrc/softmax.hip +++ /dev/null @@ -1,434 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include -#include "custom_hip_layers.h" - -//#include -#include -#include -#include - -#define ATTN_THREADS 1024 -#define MAX_REG_SIZE 8 - -#define minus_infinity -10000.0 - -void CheckCudaErrorAux(const char* file, unsigned line) -{ - hipError_t err = hipGetLastError(); - if (err == hipSuccess) return; - std::cerr << hipGetErrorString(err) << "(" << err << ") at " << file << ":" << line - << std::endl; - throw std::runtime_error("CUDA ERROR!!!\n"); -} - -#define CUDA_CHECK_ERROR() CheckCudaErrorAux(__FILE__, __LINE__) - -namespace cg = cooperative_groups; - -__global__ void attn_softmax_v2(__half* vals, - __half* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int total_count, - int heads, - int sequence_length, - int num_seq, - float scale, - int iterations, - int reduceWidth) -{ -#ifdef HALF_PRECISION_AVAILABLE - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - float2 low_data[MAX_REG_SIZE]; - float2 high_data[MAX_REG_SIZE]; - - __half2 h_scale = __float2half2_rn(scale); - - int wid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int reduce_blocks = reduceWidth >> 5; - int seq_lane = threadIdx.x % reduceWidth; - - __shared__ float partialSum[MAX_WARP_NUM]; - - int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); - - if (iter_offset < total_count) { - vals += (iter_offset * sequence_length); - - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); - int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; - - int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); - int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) - ? (real_seq_id >> 2) - (window_size >> 2) - : 0; - int window_stride = - (local_attention && real_seq_id >= window_size) ? 
real_seq_id - window_size : -1; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; - low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? __half2float(vals[data_id + 1]) - : minus_infinity; - high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? __half2float(vals[data_id + 2]) - : minus_infinity; - high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? __half2float(vals[data_id + 3]) - : minus_infinity; - if (mask && recompute) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); - } - } else { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; - low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) && - (data_id + 1) > window_stride) && - (data_id + 1) < sequence_length) - ? __half2float(vals[data_id + 1]) - : minus_infinity; - high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) && - (data_id + 2) > window_stride) && - (data_id + 2) < sequence_length) - ? 
__half2float(vals[data_id + 2]) - : minus_infinity; - high_data[i].y = minus_infinity; - if (mask && recompute) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - if ((data_id + 1) < sequence_length) - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - if ((data_id + 2) < sequence_length) - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - } - } - // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); - max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); - max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); - max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); - max_val = (high_data[i].y > max_val ? high_data[i].y : max_val); - } else { - low_data[i].x = minus_infinity; - low_data[i].y = minus_infinity; - high_data[i].x = minus_infinity; - high_data[i].y = minus_infinity; - } - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE); - } - float sum = 0; - for (int i = 0; i < iterations; i++) { - low_data[i].x = __expf(low_data[i].x - max_val); - low_data[i].y = __expf(low_data[i].y - max_val); - high_data[i].x = __expf(high_data[i].x - max_val); - high_data[i].y = __expf(high_data[i].y - max_val); - - sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i); - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / WARP_SIZE); - } - sum += 1e-6; - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - - if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = low_data[i].x / sum; - vals[data_id + 1] = low_data[i].y / sum; - vals[data_id + 2] = high_data[i].x / sum; - vals[data_id + 3] = high_data[i].y / sum; - } else { - vals[data_id] = low_data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = low_data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = high_data[i].x / sum; - } - } - } - } -#endif -} - -__global__ void attn_softmax_v2(float* vals, - float* attn_mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int total_count, - int heads, - int sequence_length, - int num_seq, - float scale, - int iterations, - int reduceWidth) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - float4 data[MAX_REG_SIZE]; - - int wid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int reduce_blocks = reduceWidth >> 5; - int seq_lane = threadIdx.x % reduceWidth; - - __shared__ float 
partialSum[MAX_WARP_NUM]; - - int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); - if (iter_offset < total_count) { - vals += (iter_offset * sequence_length); - - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); - int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; - - int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); - int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) - ? (real_seq_id >> 2) - (window_size >> 2) - : 0; - int window_stride = - (local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - data[i].x = (data_id > window_stride ? vals[data_id] : minus_infinity); - data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? vals[data_id + 1] - : minus_infinity; - data[i].z = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? vals[data_id + 2] - : minus_infinity; - data[i].w = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? vals[data_id + 3] - : minus_infinity; - if (attn_mask && recompute) { - data[i].x += attn_mask[data_id + mask_offset]; - data[i].y += attn_mask[data_id + mask_offset + 1]; - data[i].z += attn_mask[data_id + mask_offset + 2]; - data[i].w += attn_mask[data_id + mask_offset + 3]; - } - } else { - data[i].x = data_id > window_stride ? vals[data_id] : minus_infinity; - data[i].y = (((!triangular || (data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride && (data_id + 1) < sequence_length) - ? 
(vals[data_id + 1]) - : minus_infinity; - data[i].z = (((!triangular || (data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride && (data_id + 2) < sequence_length) - ? (vals[data_id + 2]) - : minus_infinity; - data[i].w = minus_infinity; - if (attn_mask && recompute) { - data[i].x += attn_mask[data_id + mask_offset]; - if ((data_id + 1) < sequence_length) - data[i].y += attn_mask[data_id + mask_offset + 1]; - if ((data_id + 2) < sequence_length) - data[i].z += attn_mask[data_id + mask_offset + 2]; - } - } - max_val = (data[i].x > max_val ? data[i].x : max_val); - max_val = (data[i].y > max_val ? data[i].y : max_val); - max_val = (data[i].z > max_val ? data[i].z : max_val); - max_val = (data[i].w > max_val ? data[i].w : max_val); - } else { - data[i].x = minus_infinity; - data[i].y = minus_infinity; - data[i].z = minus_infinity; - data[i].w = minus_infinity; - } - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - data[i].x = __expf(data[i].x - max_val); - data[i].y = __expf(data[i].y - max_val); - data[i].z = __expf(data[i].z - max_val); - data[i].w = __expf(data[i].w - max_val); - - sum += (data[i].x + data[i].y + data[i].z + data[i].w); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i); - - if (reduceWidth > WARP_SIZE) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - - b.sync(); - - for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / WARP_SIZE); - } - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - - if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = data[i].x / sum; - vals[data_id + 1] = data[i].y / sum; - vals[data_id + 2] = data[i].z / sum; - vals[data_id + 3] = data[i].w / sum; - } else { - vals[data_id] = data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = data[i].z / sum; - } - } - } - } -} - -template -void launch_attn_softmax_v2(T* vals, - T* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - hipStream_t stream) -{ - int total_count = batch_size * heads * num_seq; - dim3 grid_dim((total_count - 1) / (WARP_SIZE / ((sequence_length - 1) / ATTN_THREADS + 1)) + 1); - dim3 block_dim(ATTN_THREADS); - - const int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1) * WARP_SIZE; - const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1; - - if (sequence_length <= 32768) - hipLaunchKernelGGL(( attn_softmax_v2), dim3(grid_dim), dim3(block_dim), 0, stream, 
- vals, - mask, - triangular, - recompute, - local_attention, - window_size, - total_count, - (triangular ? (heads * batch_size) : heads), - sequence_length, - num_seq, - scale, - iterations, - reduce_width); - else - throw std::runtime_error("Unsupport Seq_Length!"); -} - -template void launch_attn_softmax_v2(float* vals, - float* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - hipStream_t stream); -template void launch_attn_softmax_v2(__half* vals, - __half* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - hipStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/inference/includes/context.h b/deepspeed/ops/csrc/transformer_bak/inference/includes/context.h deleted file mode 100644 index 79008d4f3402bca94fdc411c2f1a07b76078f8a1..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/includes/context.h +++ /dev/null @@ -1,177 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" - -#define WARP_SIZE 32 - -#define CUDA_CHECK(callstr) \ - { \ - cudaError_t error_code = callstr; \ - if (error_code != cudaSuccess) { \ - std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ - assert(0); \ - } \ - } - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) - -#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \ - for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y) - -#define DS_CUDA_NUM_THREADS 512 -#define DS_MAXIMUM_NUM_BLOCKS 262144 - -inline int DS_GET_BLOCKS(const 
int N) -{ - return std::max( - std::min((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS), - // Use at least 1 block, since CUDA does not allow empty block - 1); -} - -class Context { -public: - Context() : _workspace(nullptr), _seed(42), _curr_offset(0), _stream(0) - { - curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(_gen, 123); - if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) { - auto message = std::string("Fail to create cublas handle."); - std::cerr << message << std::endl; - throw std::runtime_error(message); - } - cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); - cudaEventCreate(&_comp1_event, (cudaEventDisableTiming | cudaEventBlockingSync)); - cudaEventCreate(&_comp2_event, (cudaEventDisableTiming | cudaEventBlockingSync)); - cudaEventCreate(&_comp_event, (cudaEventDisableTiming | cudaEventBlockingSync)); - cudaEventCreate(&_comm_event, (cudaEventDisableTiming | cudaEventBlockingSync)); - } - - virtual ~Context() - { - cublasDestroy(_cublasHandle); - cudaFree(_workspace); - cudaEventDestroy(_comp1_event); - cudaEventDestroy(_comp2_event); - cudaEventDestroy(_comp_event); - cudaEventDestroy(_comm_event); - } - - static Context& Instance() - { - static Context _ctx; - return _ctx; - } - - void GenWorkSpace(size_t size) - { - if (!_workspace) { - assert(_workspace == nullptr); - cudaMalloc(&_workspace, size); - } else if (_workSpaceSize < size) { - cudaFree(_workspace); - cudaMalloc(&_workspace, size); - } - - _workSpaceSize = size; - } - - cudaEvent_t GetCompEvent(int id) { return id == 1 ? 
_comp1_event : _comp2_event; } - - size_t get_workspace_size() const { return _workSpaceSize; } - void* GetWorkSpace() { return _workspace; } - - inline unsigned new_token(unsigned layer_id) - { - if (layer_id == 0) _token_length++; - return _token_length; - } - - inline void reset_tokens(unsigned initial_tokens = 0) - { - _num_tokens = initial_tokens; - } //_token_length = 0; } - - inline unsigned current_tokens() const { return _num_tokens; } - - inline void advance_tokens() { _num_tokens++; } - - curandGenerator_t& GetRandGenerator() { return _gen; } - - cudaStream_t GetCommStream(bool async_op = false) - { - if (!_comm_stream) - _comm_stream = async_op ? at::cuda::getStreamFromPool(true) - : at::cuda::getCurrentCUDAStream(); - return _comm_stream; - } - cudaStream_t GetCurrentStream(bool other_stream = false) - { - // get current pytorch stream. - if (other_stream) { - if (!_stream) _stream = at::cuda::getStreamFromPool(true); - return _stream; - } - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - return stream; - } - - cublasHandle_t GetCublasHandle() { return _cublasHandle; } - - std::pair IncrementOffset(uint64_t offset_inc) - { - uint64_t offset = _curr_offset; - _curr_offset += offset_inc; - return std::pair(_seed, offset); - } - - void SetSeed(uint64_t new_seed) { _seed = new_seed; } - - const std::vector>& GetGemmAlgos() const { return _gemm_algos; } - - inline void SynchComp() - { - cudaEventRecord(_comp_event, _comp_stream); - cudaStreamWaitEvent(_comm_stream, _comp_event, 0); - } - inline void SynchComm() - { - cudaEventRecord(_comm_event, _comm_stream); - cudaStreamWaitEvent(_comp_stream, _comm_event, 0); - } - -private: - curandGenerator_t _gen; - cublasHandle_t _cublasHandle; - - cudaEvent_t _comp_event; - cudaEvent_t _comm_event; - - void* _workspace; - uint64_t _seed; - uint64_t _curr_offset; - size_t _workSpaceSize; - - cudaEvent_t _comp1_event; - cudaEvent_t _comp2_event; - - cudaStream_t _stream; - - unsigned _token_length; - 
unsigned _num_tokens; - std::vector> _gemm_algos; - - cudaStream_t _comp_stream; - cudaStream_t _comm_stream; - - std::unordered_map _world_sizes; -}; diff --git a/deepspeed/ops/csrc/transformer_bak/inference/includes/context_hip.h b/deepspeed/ops/csrc/transformer_bak/inference/includes/context_hip.h deleted file mode 100644 index 89c6299bfbf89f3248a492fcfd6e7c61cb1df9fd..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/includes/context_hip.h +++ /dev/null @@ -1,178 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#pragma once - -#include -#include -#include -#include -#include -#include "rocblas.h" -#include "hip/hip_runtime.h" -#include "hiprand/hiprand.h" - -#define WARP_SIZE 32 - -#define CUDA_CHECK(callstr) \ - { \ - hipError_t error_code = callstr; \ - if (error_code != hipSuccess) { \ - std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ - assert(0); \ - } \ - } - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) - -#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \ - for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y) - -#define DS_CUDA_NUM_THREADS 512 -#define DS_MAXIMUM_NUM_BLOCKS 262144 - -inline int DS_GET_BLOCKS(const int N) -{ - return std::max( - std::min((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS), - // Use at least 1 block, since CUDA does not allow empty block - 1); -} - -class Context { -public: - Context() : _workspace(nullptr), _seed(42), _curr_offset(0), _stream(0) - { - hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT); - hiprandSetPseudoRandomGeneratorSeed(_gen, 123); - if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) { - auto message = std::string("Fail to create cublas handle."); 
- std::cerr << message << std::endl; - throw std::runtime_error(message); - } - rocblas_set_math_mode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); - hipEventCreate(&_comp1_event, (hipEventDisableTiming | hipEventBlockingSync)); - hipEventCreate(&_comp2_event, (hipEventDisableTiming | hipEventBlockingSync)); - hipEventCreate(&_comp_event, (hipEventDisableTiming | hipEventBlockingSync)); - hipEventCreate(&_comm_event, (hipEventDisableTiming | hipEventBlockingSync)); - } - - virtual ~Context() - { - rocblas_destroy_handle(_cublasHandle); - hipFree(_workspace); - hipEventDestroy(_comp1_event); - hipEventDestroy(_comp2_event); - hipEventDestroy(_comp_event); - hipEventDestroy(_comm_event); - } - - static Context& Instance() - { - static Context _ctx; - return _ctx; - } - - void GenWorkSpace(size_t size) - { - if (!_workspace) { - assert(_workspace == nullptr); - hipMalloc(&_workspace, size); - } else if (_workSpaceSize < size) { - hipFree(_workspace); - hipMalloc(&_workspace, size); - } - - _workSpaceSize = size; - } - - hipEvent_t GetCompEvent(int id) { return id == 1 ? _comp1_event : _comp2_event; } - - size_t get_workspace_size() const { return _workSpaceSize; } - void* GetWorkSpace() { return _workspace; } - - inline unsigned new_token(unsigned layer_id) - { - if (layer_id == 0) _token_length++; - return _token_length; - } - - inline void reset_tokens(unsigned initial_tokens = 0) - { - _num_tokens = initial_tokens; - } //_token_length = 0; } - - inline unsigned current_tokens() const { return _num_tokens; } - - inline void advance_tokens() { _num_tokens++; } - - hiprandGenerator_t& GetRandGenerator() { return _gen; } - - hipStream_t GetCommStream(bool async_op = false) - { - if (!_comm_stream) - _comm_stream = async_op ? at::hip::getStreamFromPoolMasqueradingAsCUDA(true) - : at::hip::getCurrentHIPStreamMasqueradingAsCUDA(); - return _comm_stream; - } - hipStream_t GetCurrentStream(bool other_stream = false) - { - // get current pytorch stream. 
- if (other_stream) { - if (!_stream) _stream = at::hip::getStreamFromPoolMasqueradingAsCUDA(true); - return _stream; - } - hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA(); - return stream; - } - - rocblas_handle GetCublasHandle() { return _cublasHandle; } - - std::pair IncrementOffset(uint64_t offset_inc) - { - uint64_t offset = _curr_offset; - _curr_offset += offset_inc; - return std::pair(_seed, offset); - } - - void SetSeed(uint64_t new_seed) { _seed = new_seed; } - - const std::vector>& GetGemmAlgos() const { return _gemm_algos; } - - inline void SynchComp() - { - hipEventRecord(_comp_event, _comp_stream); - hipStreamWaitEvent(_comm_stream, _comp_event, 0); - } - inline void SynchComm() - { - hipEventRecord(_comm_event, _comm_stream); - hipStreamWaitEvent(_comp_stream, _comm_event, 0); - } - -private: - hiprandGenerator_t _gen; - rocblas_handle _cublasHandle; - - hipEvent_t _comp_event; - hipEvent_t _comm_event; - - void* _workspace; - uint64_t _seed; - uint64_t _curr_offset; - size_t _workSpaceSize; - - hipEvent_t _comp1_event; - hipEvent_t _comp2_event; - - hipStream_t _stream; - - unsigned _token_length; - unsigned _num_tokens; - std::vector> _gemm_algos; - - hipStream_t _comp_stream; - hipStream_t _comm_stream; - - std::unordered_map _world_sizes; -}; diff --git a/deepspeed/ops/csrc/transformer_bak/inference/includes/cublas_wrappers.h b/deepspeed/ops/csrc/transformer_bak/inference/includes/cublas_wrappers.h deleted file mode 100644 index 3addd0291f03cb46a50cc21fcbfd4e22af6929ef..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/includes/cublas_wrappers.h +++ /dev/null @@ -1,207 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - 
cublasGemmAlgo_t algo) -{ - cublasStatus_t status = cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - CUDA_R_32F, - (transa == CUBLAS_OP_N) ? m : k, - (const void*)B, - CUDA_R_32F, - (transb == CUBLAS_OP_N) ? k : n, - (const void*)beta, - C, - CUDA_R_32F, - m, - CUDA_R_32F, - algo); - - if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasGemmAlgo_t algo) -{ - cublasStatus_t status = cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - CUDA_R_16F, - (transa == CUBLAS_OP_N) ? m : k, - (const void*)B, - CUDA_R_16F, - (transb == CUBLAS_OP_N) ? k : n, - (const void*)beta, - (void*)C, - CUDA_R_16F, - m, - CUDA_R_32F, - algo); - - if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -{ - cublasStatus_t status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - CUDA_R_32F, - (op_A == CUBLAS_OP_N) ? m : k, - stride_A, - B, - CUDA_R_32F, - (op_B == CUBLAS_OP_N) ? k : n, - stride_B, - beta, - C, - CUDA_R_32F, - m, - stride_C, - batch, - CUDA_R_32F, - algo); - - if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, - "!!!! 
kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n", - batch, - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -{ - cublasStatus_t status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - CUDA_R_16F, - (op_A == CUBLAS_OP_N) ? m : k, - stride_A, - B, - CUDA_R_16F, - (op_B == CUBLAS_OP_N) ? k : n, - stride_B, - beta, - C, - CUDA_R_16F, - m, - stride_C, - batch, - CUDA_R_32F, - algo); - - if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - - return 0; -} diff --git a/deepspeed/ops/csrc/transformer_bak/inference/includes/cublas_wrappers_hip.h b/deepspeed/ops/csrc/transformer_bak/inference/includes/cublas_wrappers_hip.h deleted file mode 100644 index 285e5befdedccfca5d09b72cfd73dfc1ef002f5d..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/includes/cublas_wrappers_hip.h +++ /dev/null @@ -1,208 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasGemmAlgo_t algo) -{ - rocblas_status status = rocblas_gemmex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - hipR32F, - (transa == rocblas_operation_none) ? 
m : k, - (const void*)B, - hipR32F, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - C, - hipR32F, - m, - hipR32F, - algo); - - if (status != rocblas_status_success) { - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasGemmAlgo_t algo) -{ - rocblas_status status = rocblas_gemmex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - hipR16F, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - hipR16F, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - (void*)C, - hipR16F, - m, - hipR32F, - algo); - - if (status != rocblas_status_success) { - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -{ - rocblas_status status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - hipR32F, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - hipR32F, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - hipR32F, - m, - stride_C, - batch, - hipR32F, - algo); - - if (status != rocblas_status_success) { - fprintf(stderr, - "!!!! kernel execution error. 
(batch: %d, m: %d, n: %d, k: %d, error: %d) \n", - batch, - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -{ - rocblas_status status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - hipR16F, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - hipR16F, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - hipR16F, - m, - stride_C, - batch, - hipR32F, - algo); - - if (status != rocblas_status_success) { - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - - return 0; -} diff --git a/deepspeed/ops/csrc/transformer_bak/inference/includes/custom_cuda_layers.h b/deepspeed/ops/csrc/transformer_bak/inference/includes/custom_cuda_layers.h deleted file mode 100644 index 06b4340061c98c65b4b301c7349d2da03185f715..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/includes/custom_cuda_layers.h +++ /dev/null @@ -1,124 +0,0 @@ -#pragma once - -#ifdef __HIP_PLATFORM_HCC__ -#define HALF_PRECISION_AVAILABLE = 1 -#include -#else -#if __CUDA_ARCH__ >= 700 -#define HALF_PRECISION_AVAILABLE = 1 -#endif -#include -#endif - -#include -#include -#include -#include -#include -#include - -#define MAX_WARP_NUM 32 -#define WARP_SIZE 32 -#define SMs 80 - -#define MAX_REGISTERS 256 -template -void launch_attn_softmax_v2(T* vals, - T* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - cudaStream_t stream); - -// Fused bias add with gelu activation 
-template -void launch_bias_gelu(T* input, - const T* bias, - int intermediate_size, - int batch_size, - cudaStream_t stream); -template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream); - -template -void launch_bias_residual(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int hidden_dim, - int mp_size, - cudaStream_t stream); - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream); - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream); -template -void launch_dequantize(T* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count, - cudaStream_t stream); - -template -void launch_gptj_residual_add(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int head_size, - int mp_size, - cudaStream_t stream); - -template -void launch_apply_rotary_pos_emb(T* mixed_query, - T* key_layer, - unsigned head_size, - unsigned seq_len, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - unsigned batch, - bool rotate_half, - bool rotate_every_two, - cudaStream_t stream); - -template -void launch_moe_res_matmul(T* residual, - T* coef, - T* mlp_out, - int seq_len, - int hidden_dim, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/inference/includes/custom_hip_layers.h b/deepspeed/ops/csrc/transformer_bak/inference/includes/custom_hip_layers.h deleted file mode 100644 index 36cab34d6262f5d6211a18584f6d55284c04846e..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/inference/includes/custom_hip_layers.h +++ /dev/null @@ 
-1,125 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#pragma once - -#ifdef __HIP_PLATFORM_HCC__ -#define HALF_PRECISION_AVAILABLE = 1 -#include -#else -#if __CUDA_ARCH__ >= 700 -#define HALF_PRECISION_AVAILABLE = 1 -#endif -#include -#endif - -#include -#include -#include -#include -#include -#include - -#define MAX_WARP_NUM 32 -#define WARP_SIZE 32 -#define SMs 80 - -#define MAX_REGISTERS 256 -template -void launch_attn_softmax_v2(T* vals, - T* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - hipStream_t stream); - -// Fused bias add with gelu activation -template -void launch_bias_gelu(T* input, - const T* bias, - int intermediate_size, - int batch_size, - hipStream_t stream); -template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream); - -template -void launch_bias_residual(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int hidden_dim, - int mp_size, - hipStream_t stream); - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream); - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - hipStream_t stream); -template -void launch_dequantize(T* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count, - hipStream_t stream); - -template -void launch_gptj_residual_add(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int head_size, - int mp_size, - hipStream_t stream); - -template -void launch_apply_rotary_pos_emb(T* mixed_query, - T* key_layer, - 
unsigned head_size, - unsigned seq_len, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - unsigned batch, - bool rotate_half, - bool rotate_every_two, - hipStream_t stream); - -template -void launch_moe_res_matmul(T* residual, - T* coef, - T* mlp_out, - int seq_len, - int hidden_dim, - hipStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/normalize_kernels.cu b/deepspeed/ops/csrc/transformer_bak/normalize_kernels.cu deleted file mode 100644 index d634c7f1b2cd1c2632495d8e1f3b47b45867c353..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/normalize_kernels.cu +++ /dev/null @@ -1,2121 +0,0 @@ -#include "custom_cuda_layers.h" - -namespace cg = cooperative_groups; - -/* -Fused bias add, residual (elementwise) add, and normalization layer. - -For FP16, this kernel does not promote to FP32 in order to utilize the 2x throughput for -__half2 instructions, and avoid the conversion overhead (1/8 of __hal2 arithmetic). - -For specific launch constraints, see the launch functions. 
-*/ - -#define NORM_REG (MAX_REGISTERS / 4) - -__global__ void fused_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - bool preLayerNorm, - bool training, - float* vars, - float* means, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id / WARP_SIZE; - - float vals_arr[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - residual += (row * row_stride); - vals += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[i * iteration_stride + id]; - sum += vals_arr[i]; - } - if (high_index < row_stride) { - vals_arr[iterations] = residual[high_index]; - sum += vals_arr[iterations]; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700 - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - - sum = g.shfl(sum, 0); - float mean = sum / row_stride; - if (training) - if (threadIdx.x == 0) means[row] = mean; - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_arr[i] -= mean; - variance += vals_arr[i] * vals_arr[i]; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < 
(iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= row_stride; - variance += epsilon; - if (training) - if (threadIdx.x == 0) vars[row] = variance; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr[i] = vals_arr[i] * rsqrtf(variance); - vals_arr[i] = - vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[i * iteration_stride + id] = vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); - vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; - vals[high_index] = vals_arr[iterations]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - bool preLayerNorm, - bool training, - __half* vars, - __half* means, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> WARP_SIZE_BITS; - - float2 vals_f[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - const __half2* residual_cast = reinterpret_cast(residual); - - residual_cast += (row * row_stride); - vals_cast += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); - sum += vals_f[i].x; - sum += vals_f[i].y; - } - if ((high_index) < row_stride) { - vals_f[iterations] = __half22float2(residual_cast[high_index]); - sum += vals_f[iterations].x; - sum += 
vals_f[iterations].y; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - sum = g.shfl(sum, 0); - float mean = sum / (row_stride * 2); - - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_f[i].x -= mean; - vals_f[i].y -= mean; - variance += vals_f[i].x * vals_f[i].x; - variance += vals_f[i].y * vals_f[i].y; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= (row_stride * 2); - variance += epsilon; - - __half2 variance_h = __float2half2_rn(variance); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - - if (training && threadIdx.x == 0) { - vars[row] = __float2half(variance); - means[row] = __float2half(mean); - } - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - __half2 vals_arr = __float22half2_rn(vals_f[i]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = - vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; - vals_cast[i * iteration_stride + id] = vals_arr; - } - if ((high_index) < row_stride) { - __half2 vals_arr = __float22half2_rn(vals_f[iterations]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = vals_arr * gamma_cast[high_index] + 
beta_cast[high_index]; - vals_cast[high_index] = vals_arr; - } -#endif -} - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - T* vars, - T* means); - -template <> -void launch_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - float* vars, - float* means) -{ - int threads = THREADS; - - dim3 grid_dim(batch_size); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim); -} - -template <> -void launch_bias_residual_layer_norm<__half>(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - __half* vars, - __half* means) -{ - int threads = 128; - - dim3 grid_dim(batch_size); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim / 2); -} - -__global__ void fused_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float 
epsilon, - bool preLayerNorm, - bool training, - float* vars, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id / 32; - - float vals_arr[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - residual += (row * row_stride); - vals += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[i * iteration_stride + id]; - sum += vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = residual[high_index]; - sum += vals_arr[iterations]; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700 - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - - sum = g.shfl(sum, 0); - float mean = sum / row_stride; - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_arr[i] -= mean; - variance += vals_arr[i] * vals_arr[i]; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= row_stride; - variance += epsilon; - if (training) - if (threadIdx.x == 0) vars[row] = variance; - - iterations = 
row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr[i] = vals_arr[i] * rsqrtf(variance); - vals_arr[i] = - vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[i * iteration_stride + id] = vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); - vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; - vals[high_index] = vals_arr[iterations]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - bool preLayerNorm, - bool training, - __half* vars, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> WARP_SIZE_BITS; - - float2 vals_f[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - const __half2* residual_cast = reinterpret_cast(residual); - - residual_cast += (row * row_stride); - vals_cast += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); - sum += vals_f[i].x; - sum += vals_f[i].y; - } - if ((high_index) < row_stride) { - vals_f[iterations] = __half22float2(residual_cast[high_index]); - sum += vals_f[iterations].x; - sum += vals_f[iterations].y; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#ifndef 
__STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - sum = g.shfl(sum, 0); - float mean = sum / (row_stride * 2); - - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_f[i].x -= mean; - vals_f[i].y -= mean; - variance += vals_f[i].x * vals_f[i].x; - variance += vals_f[i].y * vals_f[i].y; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= (row_stride * 2); - variance += epsilon; - - __half2 variance_h = __float2half2_rn(variance); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - - if (training && threadIdx.x == 0) vars[row] = __float2half(variance); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - __half2 vals_arr = __float22half2_rn(vals_f[i]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = - vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; - vals_cast[i * iteration_stride + id] = vals_arr; - } - if ((high_index) < row_stride) { - __half2 vals_arr = __float22half2_rn(vals_f[iterations]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index]; - vals_cast[high_index] = vals_arr; - } -#endif -} - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - T* vars); - -/* -To 
tune this launch the following restrictions must be met: - -For float: -row_stride == hidden_size -threads * iterations == row_stride -threads is in [32, 64, 128, 256, 512, 1024] - -For half: -row_stride == hidden_size / 2 -threads * iterations == row_stride -threads is in [32, 64, 128, 256, 512, 1024] - -*/ - -template <> -void launch_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - float* vars) -{ - int threads = THREADS; - - dim3 grid_dim(batch_size); - - // There are some limitations to call below functions, now just enumerate the situations. - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim); -} - -template <> -void launch_bias_residual_layer_norm<__half>(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream, - bool preLayerNorm, - bool training, - __half* vars) -{ - int threads = 128; - - dim3 grid_dim(batch_size); - - // There are some limitations to call below functions, now just enumerate the situations. 
- - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - fused_bias_residual_layer_norm<<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim / 2); -} - -/* Normalize Gamma & Betta gradients - * Compute gradients using either X_hat or - * normalize input (invertible). - * Combine transpose with gradients computation. - */ - -template -__global__ void LayerNormBackward1(const T* __restrict__ out_grad, - const T* __restrict__ vals_hat, - const T* __restrict__ gamma, - const T* __restrict__ betta, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width, - bool invertible) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - float betta_reg = (invertible ? (float)betta[idx] : 0.0f); - float gamma_reg = (float)gamma[idx]; - - // Loop across matrix height - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad[offset]; - float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg - : (float)vals_hat[offset]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -/* Normalize Gamma & Betta gradients - * Compute gradients using the input to - * the normalize. - * Combine transpose with gradients computation. - */ - -template -__global__ void LayerNormBackward1(const T* __restrict__ out_grad, - const T* __restrict__ X_data, - const T* __restrict__ vars, - const T* __restrict__ means, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - // Loop across matrix height - - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad[offset]; - float val = (float)X_data[offset]; - val = (val - (float)means[r]) * rsqrtf((float)vars[r]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} -/* - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is invertible! - * We do the backward using the X_hat (X - u) / sqrt(variance) or the output of Normalization. - */ - -__global__ void LayerNormBackward2(const float* out_grad, - const float* vals_hat, - const float* gamma, - const float* betta, - const float* vars, - float* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad += (row * row_stride); - vals_hat += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat[high_index] - betta[high_index]) / gamma_reg - : vals_hat[high_index]); - iterations++; - } - - float var_reg = vars[row]; - - float sum = 0; - for (int i = 0; i < iterations; i++) { - sum += vals_hat_arr[i] * vals_arr[i] * - sqrtf(var_reg); // dval_hat = gamma * (x - u) * out_grad - vals_arr[i] *= rsqrtf(var_reg); // dvar_inv = gamma * out_grad / sqrt(var) - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); - if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); -} - -__global__ void LayerNormBackward2(const __half* out_grad, - const __half* vals_hat, - const __half* gamma, - const __half* betta, - const __half* vars, - __half* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = 
blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h = reinterpret_cast(out_grad); - const __half2* vals_hat_h = reinterpret_cast(vals_hat); - - inp_grad_h += (row * row_stride); - out_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible - ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg - : vals_hat_h[high_index]); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg)); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 temp_f = __half22float2(temp); - vals_arr_f[i].x += temp_f.x; - vals_arr_f[i].y += temp_f.y; - } - sum = 0.f; - - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - - inp_grad_h[i * iteration_stride + id] = temp; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - 
vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - - inp_grad_h[high_index] = temp; - } -} - -template <> -void launch_layerNorm_backward(const float* out_grad, - const float* vals_hat, - const float* vars, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2], - bool invertible, - const float* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<<>>( - out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - - LayerNormBackward2<<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); -} - -template <> -void launch_layerNorm_backward<__half>(const __half* out_grad, - const __half* vals_hat, - const __half* vars, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2], - bool invertible, - const __half* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - // LayerNormBackward1<__half><<>>( - // out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - - LayerNormBackward2<<>>( - out_grad, 
vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); -} - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is not invertible! - * We do the backward using the input (X) - */ - -__global__ void LayerNormBackward2(const float* out_grad, - const float* X_vals, - const float* gamma, - const float* vars, - const float* means, - float* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id >> WARP_SIZE_BITS; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad += (row * row_stride); - X_vals += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad[high_index]; - vals_arr[iterations] *= gamma_reg; - iterations++; - } - - float var_reg = vars[row]; - float mean_reg = means[row]; - - float sum = 0; - float xu[NORM_REG]; - for (int i = 0; i < iterations; i++) { - xu[i] = (X_vals[i * iteration_stride + id] - mean_reg); - sum += vals_arr[i] * xu[i]; - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - 
sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { - vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg)); - } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); - if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); -} - -__global__ void LayerNormBackward2(const __half* out_grad, - const __half* X_vals, - const __half* gamma, - const __half* vars, - const __half* means, - __half* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id >> WARP_SIZE_BITS; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 xu[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h = reinterpret_cast(out_grad); - const __half2* vals_hat_h = reinterpret_cast(X_vals); - - inp_grad_h += (row * row_stride); - out_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - int high_index = iterations * iteration_stride + id; - - __half mean_h = means[row]; - __half2 mean_reg = __halves2half2(mean_h, mean_h); 
-#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - xu[i] = (vals_hat_h[i * iteration_stride + id] - mean_reg); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - xu[iterations] = (vals_hat_h[high_index] - mean_reg); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (xu[i] * vals_arr[i]); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 xu_grad_f = __half22float2(xu_grad); - vals_arr_f[i].x += xu_grad_f.x; - vals_arr_f[i].y += xu_grad_f.y; - } - - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < 
warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[i * iteration_stride + id] = temp; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - inp_grad_h[high_index] = temp; - } -} - -template <> -void launch_layerNorm_backward(const float* out_grad, - const float* X_data, - const float* vars, - const float* means, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<<>>( - out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - LayerNormBackward2<<>>( - out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim); -} - -template <> -void launch_layerNorm_backward<__half>(const __half* out_grad, - const __half* X_data, - const __half* vars, - const __half* means, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<__half><<>>( - out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) 
- threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - LayerNormBackward2<<>>( - out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); -} - -template -__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1, - const T* __restrict__ out_grad2, - const T* __restrict__ vals_hat, - const T* __restrict__ gamma, - const T* __restrict__ betta, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width, - bool invertible) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - float betta_reg = (invertible ? (float)betta[idx] : 0.0f); - float gamma_reg = (float)gamma[idx]; - - // Loop across matrix height - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad1[offset] + (float)out_grad2[offset]; - float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg - : (float)vals_hat[offset]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -template -__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1, - const T* __restrict__ out_grad2, - const T* __restrict__ X_data, - const T* __restrict__ vars, - const T* __restrict__ means, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - // Loop across matrix height - - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad1[offset] + (float)out_grad2[offset]; - float val = (float)X_data[offset]; - val = (val - (float)means[r]) * rsqrtf((float)vars[r]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -__global__ void LayerNormBackward2_fused_add(const float* out_grad1, - const float* out_grad2, - const float* vals_hat, - const float* gamma, - const float* betta, - const float* vars, - float* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad1 += (row * row_stride); - out_grad2 += (row * row_stride); - vals_hat += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad1[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat[high_index] - betta[high_index]) / gamma_reg - : vals_hat[high_index]); - iterations++; - } - - float var_reg = vars[row]; - - float sum = 0; - for (int i = 0; i < iterations; i++) { - sum += vals_hat_arr[i] * vals_arr[i] * sqrtf(var_reg); - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) - inp_grad[i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; - if ((high_index) < row_stride) - inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; -} - -__global__ void LayerNormBackward2_fused_add(const __half* out_grad1, - const __half* out_grad2, - const __half* vals_hat, - const __half* gamma, - const __half* betta, - const __half* vars, - __half* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - 
- int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - // float2 result[iterations]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h1 = reinterpret_cast(out_grad1); - const __half2* out_grad_h2 = reinterpret_cast(out_grad2); - const __half2* vals_hat_h = reinterpret_cast(vals_hat); - - inp_grad_h += (row * row_stride); - out_grad_h1 += (row * row_stride); - out_grad_h2 += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = - (invertible - ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h1[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg - : vals_hat_h[high_index]); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg)); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 temp_f = __half22float2(temp); - vals_arr_f[i].x += temp_f.x; - vals_arr_f[i].y += temp_f.y; - } - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - - inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - 
vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - - inp_grad_h[high_index] = temp + out_grad_h2[high_index]; - } -} - -template <> -void launch_layerNorm_backward_fused_add(const float* out_grad1, - const float* out_grad2, - const float* vals_hat, - const float* vars, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2], - bool invertible, - const float* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - LayerNormBackward1<<>>( - out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - LayerNormBackward2_fused_add<<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); -} - -template <> -void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, - const __half* out_grad2, - const __half* vals_hat, - const __half* vars, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2], - bool invertible, - const __half* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<__half><<>>( - out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if 
(hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - LayerNormBackward2_fused_add<<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); -} - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is not invertible! - * We do the backward using the input (X) - */ - -__global__ void LayerNormBackward2_fused_add(const float* out_grad1, - const float* out_grad2, - const float* X_vals, - const float* gamma, - const float* vars, - const float* means, - float* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - - out_grad1 += (row * row_stride); - out_grad2 += (row * row_stride); - X_vals += (row * row_stride); - inp_grad += (row * row_stride); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = X_vals[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad1[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = X_vals[high_index]; - iterations++; - } - - float var_reg = vars[row]; - float mean_reg = means[row]; - - float sum = 0; - float xu[NORM_REG]; - for (int i = 0; i < iterations; i++) { - xu[i] = (vals_hat_arr[i] - mean_reg); - sum += vals_arr[i] * xu[i]; - vals_arr[i] *= rsqrtf(var_reg); - 
} - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { - vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg)); - } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) - inp_grad[i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; - if ((high_index) < row_stride) - inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; -} - -__global__ void LayerNormBackward2_fused_add(const __half* out_grad1, - const __half* out_grad2, - const __half* X_vals, - const __half* gamma, - const __half* vars, - const __half* means, - __half* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - __half2* 
inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h1 = reinterpret_cast(out_grad1); - const __half2* out_grad_h2 = reinterpret_cast(out_grad2); - const __half2* vals_hat_h = reinterpret_cast(X_vals); - - out_grad_h1 += (row * row_stride); - out_grad_h2 += (row * row_stride); - inp_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = vals_hat_h[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h1[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - vals_hat_arr[iterations] = vals_hat_h[high_index]; - iterations++; - } - - __half mean_h = means[row]; - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - __half2 mean_reg = __halves2half2(mean_h, mean_h); - __half2 xu[NORM_REG]; - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - xu[i] = (vals_hat_arr[i] - mean_reg); - __half2 result_h = (xu[i] * vals_arr[i]); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 xu_grad = ((-sum_h * 
xu[i] * h2rsqrt(var_reg)) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 xu_grad_f = __half22float2(xu_grad); - vals_arr_f[i].x += xu_grad_f.x; - vals_arr_f[i].y += xu_grad_f.y; - } - - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - inp_grad_h[high_index] = temp + out_grad_h2[high_index]; - } -} - -template <> -void launch_layerNorm_backward_fused_add(const float* out_grad1, - const float* out_grad2, - const float* X_data, - const float* vars, - const float* means, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<<>>( - out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport 
hidden_dim."); - - dim3 block_dim2(threads); - LayerNormBackward2_fused_add<<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim); -} - -template <> -void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, - const __half* out_grad2, - const __half* X_data, - const __half* vars, - const __half* means, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - cudaStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - LayerNormBackward1<__half><<>>( - out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - LayerNormBackward2_fused_add<<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); -} diff --git a/deepspeed/ops/csrc/transformer_bak/normalize_kernels.hip b/deepspeed/ops/csrc/transformer_bak/normalize_kernels.hip deleted file mode 100644 index 3d1b17c8f779f0940593a66fea8c07bba6c5534c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/normalize_kernels.hip +++ /dev/null @@ -1,2123 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include "hip/hip_runtime.h" -#include "custom_hip_layers.h" - -namespace cg = cooperative_groups; - -/* -Fused bias add, residual (elementwise) add, and normalization layer. - -For FP16, this kernel does not promote to FP32 in order to utilize the 2x throughput for -__half2 instructions, and avoid the conversion overhead (1/8 of __hal2 arithmetic). - -For specific launch constraints, see the launch functions. 
-*/ - -#define NORM_REG (MAX_REGISTERS / 4) - -__global__ void fused_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - bool preLayerNorm, - bool training, - float* vars, - float* means, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id / WARP_SIZE; - - float vals_arr[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - residual += (row * row_stride); - vals += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[i * iteration_stride + id]; - sum += vals_arr[i]; - } - if (high_index < row_stride) { - vals_arr[iterations] = residual[high_index]; - sum += vals_arr[iterations]; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700 - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - - sum = g.shfl(sum, 0); - float mean = sum / row_stride; - if (training) - if (threadIdx.x == 0) means[row] = mean; - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_arr[i] -= mean; - variance += vals_arr[i] * vals_arr[i]; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < 
(iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= row_stride; - variance += epsilon; - if (training) - if (threadIdx.x == 0) vars[row] = variance; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr[i] = vals_arr[i] * rsqrtf(variance); - vals_arr[i] = - vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[i * iteration_stride + id] = vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); - vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; - vals[high_index] = vals_arr[iterations]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - bool preLayerNorm, - bool training, - __half* vars, - __half* means, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> WARP_SIZE_BITS; - - float2 vals_f[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - const __half2* residual_cast = reinterpret_cast(residual); - - residual_cast += (row * row_stride); - vals_cast += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); - sum += vals_f[i].x; - sum += vals_f[i].y; - } - if ((high_index) < row_stride) { - vals_f[iterations] = __half22float2(residual_cast[high_index]); - sum += vals_f[iterations].x; - sum += 
vals_f[iterations].y; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - sum = g.shfl(sum, 0); - float mean = sum / (row_stride * 2); - - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_f[i].x -= mean; - vals_f[i].y -= mean; - variance += vals_f[i].x * vals_f[i].x; - variance += vals_f[i].y * vals_f[i].y; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= (row_stride * 2); - variance += epsilon; - - __half2 variance_h = __float2half2_rn(variance); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - - if (training && threadIdx.x == 0) { - vars[row] = __float2half(variance); - means[row] = __float2half(mean); - } - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - __half2 vals_arr = __float22half2_rn(vals_f[i]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = - vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; - vals_cast[i * iteration_stride + id] = vals_arr; - } - if ((high_index) < row_stride) { - __half2 vals_arr = __float22half2_rn(vals_f[iterations]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = vals_arr * gamma_cast[high_index] + 
beta_cast[high_index]; - vals_cast[high_index] = vals_arr; - } -#endif -} - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - T* vars, - T* means); - -template <> -void launch_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - float* vars, - float* means) -{ - int threads = THREADS; - - dim3 grid_dim(batch_size); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim); -} - -template <> -void launch_bias_residual_layer_norm<__half>(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - __half* vars, - __half* means) -{ - int threads = 128; - - dim3 grid_dim(batch_size); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim / 2); -} - -__global__ void 
fused_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - bool preLayerNorm, - bool training, - float* vars, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id / 32; - - float vals_arr[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - residual += (row * row_stride); - vals += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[i * iteration_stride + id]; - sum += vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = residual[high_index]; - sum += vals_arr[iterations]; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700 - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - - sum = g.shfl(sum, 0); - float mean = sum / row_stride; - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_arr[i] -= mean; - variance += vals_arr[i] * vals_arr[i]; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance 
/= row_stride; - variance += epsilon; - if (training) - if (threadIdx.x == 0) vars[row] = variance; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr[i] = vals_arr[i] * rsqrtf(variance); - vals_arr[i] = - vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[i * iteration_stride + id] = vals_arr[i]; - } - if ((high_index) < row_stride) { - vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); - vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; - vals[high_index] = vals_arr[iterations]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - bool preLayerNorm, - bool training, - __half* vars, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> WARP_SIZE_BITS; - - float2 vals_f[NORM_REG]; - __shared__ float shr[MAX_WARP_NUM]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - const __half2* residual_cast = reinterpret_cast(residual); - - residual_cast += (row * row_stride); - vals_cast += (row * row_stride); - - float sum = 0.f; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); - sum += vals_f[i].x; - sum += vals_f[i].y; - } - if ((high_index) < row_stride) { - vals_f[iterations] = __half22float2(residual_cast[high_index]); - sum += vals_f[iterations].x; - sum += vals_f[iterations].y; - iterations++; - } - - for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) shr[gid] = sum; - - b.sync(); - 
- if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - sum += g.shfl_down(sum, i); - } - sum = g.shfl(sum, 0); - float mean = sum / (row_stride * 2); - - float variance = 0.f; - for (int i = 0; i < iterations; i++) { - vals_f[i].x -= mean; - vals_f[i].y -= mean; - variance += vals_f[i].x * vals_f[i].x; - variance += vals_f[i].y * vals_f[i].y; - } - - for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } - - if (g.thread_rank() == 0) shr[gid] = variance; - - b.sync(); - - if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) { - variance += g.shfl_down(variance, i); - } - variance = g.shfl(variance, 0); - variance /= (row_stride * 2); - variance += epsilon; - - __half2 variance_h = __float2half2_rn(variance); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - - if (training && threadIdx.x == 0) vars[row] = __float2half(variance); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - __half2 vals_arr = __float22half2_rn(vals_f[i]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = - vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; - vals_cast[i * iteration_stride + id] = vals_arr; - } - if ((high_index) < row_stride) { - __half2 vals_arr = __float22half2_rn(vals_f[iterations]); - vals_arr = vals_arr * h2rsqrt(variance_h); - vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index]; - vals_cast[high_index] = vals_arr; - } -#endif -} - -template -void launch_bias_residual_layer_norm(T* vals, - const T* residual, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - 
int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - T* vars); - -/* -To tune this launch the following restrictions must be met: - -For float: -row_stride == hidden_size -threads * iterations == row_stride -threads is in [32, 64, 128, 256, 512, 1024] - -For half: -row_stride == hidden_size / 2 -threads * iterations == row_stride -threads is in [32, 64, 128, 256, 512, 1024] - -*/ - -template <> -void launch_bias_residual_layer_norm(float* vals, - const float* residual, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - float* vars) -{ - int threads = THREADS; - - dim3 grid_dim(batch_size); - - // There are some limitations to call below functions, now just enumerate the situations. - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim); -} - -template <> -void launch_bias_residual_layer_norm<__half>(__half* vals, - const __half* residual, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - hipStream_t stream, - bool preLayerNorm, - bool training, - __half* vars) -{ - int threads = 128; - - dim3 grid_dim(batch_size); - - // There are some limitations to call below functions, now just enumerate the situations. 
- - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim(threads); - hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim / 2); -} - -/* Normalize Gamma & Betta gradients - * Compute gradients using either X_hat or - * normalize input (invertible). - * Combine transpose with gradients computation. - */ - -template -__global__ void LayerNormBackward1(const T* __restrict__ out_grad, - const T* __restrict__ vals_hat, - const T* __restrict__ gamma, - const T* __restrict__ betta, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width, - bool invertible) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - float betta_reg = (invertible ? (float)betta[idx] : 0.0f); - float gamma_reg = (float)gamma[idx]; - - // Loop across matrix height - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad[offset]; - float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg - : (float)vals_hat[offset]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -/* Normalize Gamma & Betta gradients - * Compute gradients using the input to - * the normalize. - * Combine transpose with gradients computation. - */ - -template -__global__ void LayerNormBackward1(const T* __restrict__ out_grad, - const T* __restrict__ X_data, - const T* __restrict__ vars, - const T* __restrict__ means, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - // Loop across matrix height - - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad[offset]; - float val = (float)X_data[offset]; - val = (val - (float)means[r]) * rsqrtf((float)vars[r]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} -/* - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is invertible! - * We do the backward using the X_hat (X - u) / sqrt(variance) or the output of Normalization. - */ - -__global__ void LayerNormBackward2(const float* out_grad, - const float* vals_hat, - const float* gamma, - const float* betta, - const float* vars, - float* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad += (row * row_stride); - vals_hat += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat[high_index] - betta[high_index]) / gamma_reg - : vals_hat[high_index]); - iterations++; - } - - float var_reg = vars[row]; - - float sum = 0; - for (int i = 0; i < iterations; i++) { - sum += vals_hat_arr[i] * vals_arr[i] * - sqrtf(var_reg); // dval_hat = gamma * (x - u) * out_grad - vals_arr[i] *= rsqrtf(var_reg); // dvar_inv = gamma * out_grad / sqrt(var) - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); - if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); -} - -__global__ void LayerNormBackward2(const __half* out_grad, - const __half* vals_hat, - const __half* gamma, - const __half* betta, - const __half* vars, - __half* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = 
blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h = reinterpret_cast(out_grad); - const __half2* vals_hat_h = reinterpret_cast(vals_hat); - - inp_grad_h += (row * row_stride); - out_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible - ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg - : vals_hat_h[high_index]); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg)); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 temp_f = __half22float2(temp); - vals_arr_f[i].x += temp_f.x; - vals_arr_f[i].y += temp_f.y; - } - sum = 0.f; - - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - - inp_grad_h[i * iteration_stride + id] = temp; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - 
vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - - inp_grad_h[high_index] = temp; - } -} - -template <> -void launch_layerNorm_backward(const float* out_grad, - const float* vals_hat, - const float* vars, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2], - bool invertible, - const float* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - - hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); -} - -template <> -void launch_layerNorm_backward<__half>(const __half* out_grad, - const __half* vals_hat, - const __half* vars, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2], - bool invertible, - const __half* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - //hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0], - // out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && 
hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - - hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); -} - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is not invertible! - * We do the backward using the input (X) - */ - -__global__ void LayerNormBackward2(const float* out_grad, - const float* X_vals, - const float* gamma, - const float* vars, - const float* means, - float* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id >> WARP_SIZE_BITS; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad += (row * row_stride); - X_vals += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad[high_index]; - vals_arr[iterations] *= gamma_reg; - iterations++; - } - - float var_reg = vars[row]; - float mean_reg = means[row]; - - float sum = 0; - float xu[NORM_REG]; - for (int i = 0; i < iterations; i++) { - xu[i] = (X_vals[i * iteration_stride + id] - mean_reg); - sum += vals_arr[i] * xu[i]; - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if 
(g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { - vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg)); - } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); - if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); -} - -__global__ void LayerNormBackward2(const __half* out_grad, - const __half* X_vals, - const __half* gamma, - const __half* vars, - const __half* means, - __half* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id >> WARP_SIZE_BITS; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 xu[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h = reinterpret_cast(out_grad); - const __half2* vals_hat_h = reinterpret_cast(X_vals); - - inp_grad_h += (row * row_stride); - 
out_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - int high_index = iterations * iteration_stride + id; - - __half mean_h = means[row]; - __half2 mean_reg = __halves2half2(mean_h, mean_h); -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - xu[i] = (vals_hat_h[i * iteration_stride + id] - mean_reg); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - xu[iterations] = (vals_hat_h[high_index] - mean_reg); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (xu[i] * vals_arr[i]); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 xu_grad_f = __half22float2(xu_grad); - vals_arr_f[i].x += xu_grad_f.x; - vals_arr_f[i].y += xu_grad_f.y; - } - - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) 
{ sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[i * iteration_stride + id] = temp; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - inp_grad_h[high_index] = temp; - } -} - -template <> -void launch_layerNorm_backward(const float* out_grad, - const float* X_data, - const float* vars, - const float* means, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim); -} - -template <> -void launch_layerNorm_backward<__half>(const __half* out_grad, - const __half* X_data, - const __half* vars, - const __half* means, - const __half* gamma, - __half* gamma_grad, - 
__half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); -} - -template -__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1, - const T* __restrict__ out_grad2, - const T* __restrict__ vals_hat, - const T* __restrict__ gamma, - const T* __restrict__ betta, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width, - bool invertible) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - float betta_reg = (invertible ? (float)betta[idx] : 0.0f); - float gamma_reg = (float)gamma[idx]; - - // Loop across matrix height - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad1[offset] + (float)out_grad2[offset]; - float val = (invertible ? 
((float)vals_hat[offset] - betta_reg) / gamma_reg - : (float)vals_hat[offset]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. - float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -template -__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1, - const T* __restrict__ out_grad2, - const T* __restrict__ X_data, - const T* __restrict__ vars, - const T* __restrict__ means, - T* __restrict__ gamma_grad, - T* __restrict__ betta_grad, - int rows, - int width) -{ - __shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1]; - __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1]; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; - int y_stride = width * TILE_DIM; - - int pos = blockIdx.x * TILE_DIM + threadIdx.y; - // Loop across matrix height - - float betta_tmp = 0; - float gamma_tmp = 0; - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - float grad = (float)out_grad1[offset] + (float)out_grad2[offset]; - float val = (float)X_data[offset]; - val = (val - (float)means[r]) * rsqrtf((float)vars[r]); - betta_tmp += grad; - gamma_tmp += (val * grad); - - offset += y_stride; - } - - betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp; - gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp; - - __syncthreads(); - - // Sum the shared buffer. 
- float s1 = betta_buffer[threadIdx.y][threadIdx.x]; - float s2 = gamma_buffer[threadIdx.y][threadIdx.x]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < TILE_DIM; i <<= 1) { - s1 += g.shfl_down(s1, i); - s2 += g.shfl_down(s2, i); - } - - if (threadIdx.x == 0) { - betta_grad[pos] = s1; - gamma_grad[pos] = s2; - } -} - -__global__ void LayerNormBackward2_fused_add(const float* out_grad1, - const float* out_grad2, - const float* vals_hat, - const float* gamma, - const float* betta, - const float* vars, - float* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - out_grad1 += (row * row_stride); - out_grad2 += (row * row_stride); - vals_hat += (row * row_stride); - inp_grad += (row * row_stride); - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = - (invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad1[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat[high_index] - betta[high_index]) / gamma_reg - : vals_hat[high_index]); - iterations++; - } - - float var_reg = vars[row]; - - float sum = 0; - for (int i = 0; i < iterations; i++) { - sum += vals_hat_arr[i] * vals_arr[i] * sqrtf(var_reg); - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) - inp_grad[i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; - if ((high_index) < row_stride) - inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; -} - -__global__ void LayerNormBackward2_fused_add(const __half* out_grad1, - const __half* out_grad2, - const __half* vals_hat, - const __half* gamma, - const __half* betta, - const __half* vars, - __half* inp_grad, - bool invertible, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - 
- int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - // float2 result[iterations]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h1 = reinterpret_cast(out_grad1); - const __half2* out_grad_h2 = reinterpret_cast(out_grad2); - const __half2* vals_hat_h = reinterpret_cast(vals_hat); - - inp_grad_h += (row * row_stride); - out_grad_h1 += (row * row_stride); - out_grad_h2 += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = - (invertible - ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[i * iteration_stride + id]); - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h1[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - vals_hat_arr[iterations] = - (invertible ? 
(vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg - : vals_hat_h[high_index]); - iterations++; - } - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - __half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg)); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 temp_f = __half22float2(temp); - vals_arr_f[i].x += temp_f.x; - vals_arr_f[i].y += temp_f.y; - } - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - - inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - 
vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - - inp_grad_h[high_index] = temp + out_grad_h2[high_index]; - } -} - -template <> -void launch_layerNorm_backward_fused_add(const float* out_grad1, - const float* out_grad2, - const float* vals_hat, - const float* vars, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2], - bool invertible, - const float* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - hipLaunchKernelGGL(( LayerNormBackward1), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); -} - -template <> -void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, - const __half* out_grad2, - const __half* vals_hat, - const __half* vars, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2], - bool invertible, - const __half* betta) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); - - dim3 grid_dim2(batch); - - if (hidden_dim > 
8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); -} - -/* Backward Normalize (Input-Gradient) - * Using the means and variances from the input - * This type of backward is not invertible! - * We do the backward using the input (X) - */ - -__global__ void LayerNormBackward2_fused_add(const float* out_grad1, - const float* out_grad2, - const float* X_vals, - const float* gamma, - const float* vars, - const float* means, - float* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - __shared__ float partialSum[MAX_WARP_NUM]; - - float vals_arr[NORM_REG]; - float vals_hat_arr[NORM_REG]; - - out_grad1 += (row * row_stride); - out_grad2 += (row * row_stride); - X_vals += (row * row_stride); - inp_grad += (row * row_stride); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = X_vals[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - float gamma_reg = gamma[high_index]; - vals_arr[iterations] = out_grad1[high_index]; - vals_arr[iterations] *= gamma_reg; - vals_hat_arr[iterations] = X_vals[high_index]; - 
iterations++; - } - - float var_reg = vars[row]; - float mean_reg = means[row]; - - float sum = 0; - float xu[NORM_REG]; - for (int i = 0; i < iterations; i++) { - xu[i] = (vals_hat_arr[i] - mean_reg); - sum += vals_arr[i] * xu[i]; - vals_arr[i] *= rsqrtf(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= row_stride; - - for (int i = 0; i < iterations; i++) { - vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg)); - } - - sum = 0; - for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= row_stride; - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) - inp_grad[i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; - if ((high_index) < row_stride) - inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; -} - -__global__ void LayerNormBackward2_fused_add(const __half* out_grad1, - const __half* out_grad2, - const __half* X_vals, - const __half* gamma, - const __half* vars, - const __half* means, - __half* inp_grad, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - 
int id = threadIdx.x; - int wid = id / WARP_SIZE; - int warp_num = iteration_stride >> WARP_SIZE_BITS; - - __shared__ float partialSum[MAX_WARP_NUM]; - - __half2 vals_arr[NORM_REG]; - float2 vals_arr_f[NORM_REG]; - __half2 vals_hat_arr[NORM_REG]; - - __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); - const __half2* out_grad_h1 = reinterpret_cast(out_grad1); - const __half2* out_grad_h2 = reinterpret_cast(out_grad2); - const __half2* vals_hat_h = reinterpret_cast(X_vals); - - out_grad_h1 += (row * row_stride); - out_grad_h2 += (row * row_stride); - inp_grad_h += (row * row_stride); - vals_hat_h += (row * row_stride); - - const __half2* gamma_h = reinterpret_cast(gamma); - int high_index = iterations * iteration_stride + id; -#pragma unroll - for (int i = 0; i < iterations; i++) { - __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[i * iteration_stride + id]; - vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = vals_hat_h[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - __half2 gamma_reg = gamma_h[high_index]; - vals_arr[iterations] = out_grad_h1[high_index]; - vals_arr[iterations] *= gamma_reg; // out_grad * gamma - vals_hat_arr[iterations] = vals_hat_h[high_index]; - iterations++; - } - - __half mean_h = means[row]; - __half var_h = vars[row]; - __half2 var_reg = __halves2half2(var_h, var_h); - __half2 mean_reg = __halves2half2(mean_h, mean_h); - __half2 xu[NORM_REG]; - - float sum = 0.f; - for (int i = 0; i < iterations; i++) { - xu[i] = (vals_hat_arr[i] - mean_reg); - __half2 result_h = (xu[i] * vals_arr[i]); - float2 result_f = __half22float2(result_h); - sum += result_f.x; - sum += result_f.y; - vals_arr[i] *= h2rsqrt(var_reg); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - 
__syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - __half2 sum_h = __float2half2_rn(sum); - - for (int i = 0; i < iterations; i++) { - __half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg)); - vals_arr_f[i] = __half22float2(vals_arr[i]); - float2 xu_grad_f = __half22float2(xu_grad); - vals_arr_f[i].x += xu_grad_f.x; - vals_arr_f[i].y += xu_grad_f.y; - } - - sum = 0.f; - for (int i = 0; i < iterations; i++) { - sum += (vals_arr_f[i].x); - sum += (vals_arr_f[i].y); - } - - for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); } - - if (g.thread_rank() == 0) partialSum[wid] = sum; - - __syncthreads(); - - if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()]; - -#ifndef __STOCHASTIC_MODE__ - __syncthreads(); -#endif - - for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i); - - sum = g.shfl(sum, 0); - sum /= (2 * row_stride); - - iterations = row_stride / iteration_stride; - for (int i = 0; i < iterations; i++) { - vals_arr_f[i].x -= sum; - vals_arr_f[i].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; - } - if ((high_index) < row_stride) { - vals_arr_f[iterations].x -= sum; - vals_arr_f[iterations].y -= sum; - __half2 temp = __float22half2_rn(vals_arr_f[iterations]); - inp_grad_h[high_index] = temp + out_grad_h2[high_index]; - } -} - -template <> -void launch_layerNorm_backward_fused_add(const float* out_grad1, - const float* out_grad2, - const float* X_data, - const float* vars, - const float* means, - const float* gamma, - float* gamma_grad, - float* betta_grad, - float* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1), dim3(grid_dim), dim3(block_dim), 
0, stream[0], - out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 1; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 2; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads); - hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim); -} - -template <> -void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, - const __half* out_grad2, - const __half* X_data, - const __half* vars, - const __half* means, - const __half* gamma, - __half* gamma_grad, - __half* betta_grad, - __half* inp_grad, - int batch, - int hidden_dim, - hipStream_t stream[2]) -{ - int threads = THREADS; - - dim3 grid_dim(hidden_dim / TILE_DIM); - dim3 block_dim(TILE_DIM, TILE_DIM); - - hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0], - out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); - - dim3 grid_dim2(batch); - - if (hidden_dim > 8192 && hidden_dim <= 16384) - threads <<= 1; - else if (hidden_dim > 16384 && hidden_dim <= 32768) - threads <<= 2; - else if (hidden_dim > 32768 && hidden_dim <= 65536) - threads <<= 3; - else if (hidden_dim > 65536) - throw std::runtime_error("Unsupport hidden_dim."); - - dim3 block_dim2(threads / 2); - hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1], - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); -} diff --git a/deepspeed/ops/csrc/transformer_bak/softmax_kernels.cu b/deepspeed/ops/csrc/transformer_bak/softmax_kernels.cu deleted file mode 100644 index 34487cba9bbec060baa0fb79922e691076296476..0000000000000000000000000000000000000000 --- 
a/deepspeed/ops/csrc/transformer_bak/softmax_kernels.cu +++ /dev/null @@ -1,595 +0,0 @@ -#include -#include "custom_cuda_layers.h" -#include "general_kernels.h" - -namespace cg = cooperative_groups; - -dim3 get_attn_softmax_grid(int batch_size, int heads, int sequence_length, int threads) -{ - int seq_length4 = sequence_length / 4; - int block_compute_size = - (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1); - // Note that the Y and Z dimensions are limited to 65535, while X is basically unlimited: - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications - // The batch size is typically relatively small, while the sequence length could potentially be - // arbitrarily large. We therefore place the batch size second to avoid hitting the Y limit. - unsigned x = heads * sequence_length / block_compute_size; - unsigned y = batch_size; - return {x, y}; -} - -// Fused attention + softmax -template -__global__ void attn_softmax(float* vals, - const float* attn_mask, - int heads, - int seq_length, - int iterations) -{ - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int batch = blockIdx.y; - int row = blockIdx.x; - int max_threads_in_sequence = std::max(seq_length, tbSeq); - int seq_lane = threadIdx.x % max_threads_in_sequence; - - int data_offset = batch * (gridDim.x * block_width) + row * block_width + - (threadIdx.x / max_threads_in_sequence) * seq_length; - int mask_offset = batch * seq_length; - - int wid = threadIdx.x >> WARP_SIZE_BITS; - int lane = threadIdx.x & 0x1f; - - float4* val_cast = reinterpret_cast(vals); - const float4* attn_mask_cast = reinterpret_cast(attn_mask); - - float4 data[MAX_THREAD_ITERATIONS]; - - float max_val = 
minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float4 mask = attn_mask_cast[mask_offset + data_id]; - data[i] = val_cast[data_offset + data_id]; - - data[i].x += mask.x; - data[i].y += mask.y; - data[i].z += mask.z; - data[i].w += mask.w; - - max_val = (data[i].x > max_val ? data[i].x : max_val); - max_val = (data[i].y > max_val ? data[i].y : max_val); - max_val = (data[i].z > max_val ? data[i].z : max_val); - max_val = (data[i].w > max_val ? data[i].w : max_val); - } else { - data[i].x = minus_infinity; - data[i].y = minus_infinity; - data[i].z = minus_infinity; - data[i].w = minus_infinity; - } - } - - for (int i = 1; i < tbSize; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / tbSize); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - data[i].x = __expf(data[i].x - max_val); - data[i].y = __expf(data[i].y - max_val); - data[i].z = __expf(data[i].z - max_val); - data[i].w = __expf(data[i].w - max_val); - - sum += (data[i].x + data[i].y + data[i].z + data[i].w); - } - - for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / tbSize); - } - - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - data[i].x /= sum; - data[i].y /= sum; - data[i].z /= sum; - data[i].w /= sum; - - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) val_cast[data_offset + data_id] = data[i]; - } -} - -template -__global__ void attn_softmax(__half* vals, - const __half* attn_mask, - int heads, - int seq_length, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int batch = blockIdx.y; - int row = blockIdx.x; - int max_threads_in_sequence = std::max(seq_length, tbSeq); - int seq_lane = threadIdx.x % max_threads_in_sequence; - - int data_offset = batch * (gridDim.x * block_width) + row * block_width + - (threadIdx.x / max_threads_in_sequence) * seq_length; - int mask_offset = batch * seq_length; - - int wid = threadIdx.x >> WARP_SIZE_BITS; - int lane = 
threadIdx.x & 0x1f; - - float2* val_cast = reinterpret_cast(vals); - const float2* attn_mask_cast = reinterpret_cast(attn_mask); - - val_cast += data_offset; - attn_mask_cast += mask_offset; - - float2 low_data[MAX_THREAD_ITERATIONS]; - float2 high_data[MAX_THREAD_ITERATIONS]; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float2 data = val_cast[data_id]; - float2 mask = attn_mask_cast[data_id]; - - __half2* data_arr = reinterpret_cast<__half2*>(&data); - __half2* mask_arr = reinterpret_cast<__half2*>(&mask); - - low_data[i] = __half22float2(data_arr[0]); - high_data[i] = __half22float2(data_arr[1]); - float2 low_mask = __half22float2(mask_arr[0]); - float2 high_mask = __half22float2(mask_arr[1]); - - low_data[i].x += low_mask.x; - low_data[i].y += low_mask.y; - high_data[i].x += high_mask.x; - high_data[i].y += high_mask.y; - - max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); - max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); - max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); - max_val = (high_data[i].y > max_val ? high_data[i].y : max_val); - } - } - - for (int i = 1; i < tbSize; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / tbSize); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - low_data[i].x = __expf(low_data[i].x - max_val); - low_data[i].y = __expf(low_data[i].y - max_val); - high_data[i].x = __expf(high_data[i].x - max_val); - high_data[i].y = __expf(high_data[i].y - max_val); - - sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y); - } - } - - for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / tbSize); - } - - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - low_data[i].x /= sum; - low_data[i].y /= sum; - high_data[i].x /= sum; - high_data[i].y /= sum; - - result_h[0] = __float22half2_rn(low_data[i]); - result_h[1] = __float22half2_rn(high_data[i]); - - val_cast[data_id] = result_f; - } - } - -#endif -} - -template -void launch_attn_softmax(T*, const T*, int, int, int, cudaStream_t); - -template <> -void launch_attn_softmax(float* vals, - const float* attn_mask, - int batch_size, - int heads, - int sequence_length, - cudaStream_t stream) -{ - const int threads = 128; - int seq_length4 = sequence_length / 4; - - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? 
((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - int iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - - if (sequence_length <= 8) - attn_softmax<2, (threads / 2), 2> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 16) - attn_softmax<4, (threads / 4), 4> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 32) - attn_softmax<8, (threads / 8), 8> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 64) - attn_softmax<16, (threads / 16), 16> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 128) - attn_softmax<32, (threads / 32), 32> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 256) - attn_softmax<32, (threads / 64), 64> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else { - const int threads = 256; - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - if (sequence_length <= 512) - attn_softmax<32, (threads / 128), 128><<>>( - vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4)) - attn_softmax<32, 1, 128><<>>( - vals, attn_mask, heads, seq_length4, iterations); - else - throw std::runtime_error( - "Unsupport Seq_Length! 
Check the restriction of the max_threads and " - "max_thread_iterations!"); - } -} - -template <> -void launch_attn_softmax<__half>(__half* vals, - const __half* attn_mask, - int batch_size, - int heads, - int sequence_length, - cudaStream_t stream) -{ - const int threads = 128; - int seq_length4 = sequence_length / 4; - - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - - int iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - - if (sequence_length <= 8) - attn_softmax<2, (threads / 2), 2> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 16) - attn_softmax<4, (threads / 4), 4> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 32) - attn_softmax<8, (threads / 8), 8> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 64) - attn_softmax<16, (threads / 16), 16> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 128) - attn_softmax<32, (threads / 32), 32> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 256) - attn_softmax<32, (threads / 64), 64> - <<>>(vals, attn_mask, heads, seq_length4, iterations); - else { - const int threads = 256; - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - iterations = - (sequence_length < subblock_max_workload ? 
(seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - if (sequence_length <= 512) - attn_softmax<32, (threads / 128), 128><<>>( - vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4)) - attn_softmax<32, 1, 128><<>>( - vals, attn_mask, heads, seq_length4, iterations); - else - throw std::runtime_error( - "Unsupport Seq_Length! Check the restriction of the max_threads and " - "max_thread_iterations!"); - } -} - -template -__global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_length) -{ - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; // warp-count = num_threads / WARP_SIZE (32) - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - int iterations = (seq_length < (MAX_THREAD_ITERATIONS * iteration_stride) - ? (seq_length + iteration_stride - 1) / iteration_stride - : MAX_THREAD_ITERATIONS); - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - - int wid = id >> WARP_SIZE_BITS; - int lane = id & 0x1f; - - T val_reg[MAX_THREAD_ITERATIONS]; - T soft_reg[MAX_THREAD_ITERATIONS]; - float grad_reg = 0.0f; - -#pragma unroll - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + id; - if (data_id < block_width) { - val_reg[i] = out_grad[row * block_width + data_id]; - soft_reg[i] = soft_inp[row * block_width + data_id]; - - grad_reg += ((float)val_reg[i] * - (float)soft_reg[i]); // if done in half, the multiplication, we may lose - // 2% of accuracy in computation!! 
- } - } - for (int i = 1; i < tbSize; i *= 2) grad_reg += g.shfl_xor(grad_reg, i); - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = grad_reg; - b.sync(); - - if (lane < warp_num) grad_reg = partialSum[lane]; - - int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); - - for (int i = 1; i < iters; i *= 2) grad_reg += g.shfl_xor(grad_reg, i); - - grad_reg = g.shfl(grad_reg, id / tbSize); - } - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + id; - if (data_id < block_width) { - float temp = (float)soft_reg[i] * ((float)val_reg[i] - grad_reg); - out_grad[row * block_width + data_id] = (T)temp; - } - } -} - -template -__global__ void softmax_backward_kernel_v2(T* grad /* input & output*/, - const T* output, - int softmax_length) -{ - int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; - int offset = batch_idx * softmax_length + threadIdx.x; - - grad += offset; - output += offset; - - T grad_reg[ITERATIONS]; - T output_reg[ITERATIONS]; - float sum = 0.0; - -#pragma unroll - for (int i = 0; i < ITERATIONS; ++i) { - int curr_idx = threadIdx.x + i * WARP_SIZE; - if (curr_idx < softmax_length) { - grad_reg[i] = grad[i * WARP_SIZE]; - output_reg[i] = output[i * WARP_SIZE]; - sum += (float)grad_reg[i] * (float)output_reg[i]; - } - } - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i); - -#pragma unroll - for (int i = 0; i < ITERATIONS; ++i) { - int curr_idx = threadIdx.x + i * WARP_SIZE; - if (curr_idx < softmax_length) - grad[i * WARP_SIZE] = (float)output_reg[i] * ((float)grad_reg[i] - sum); - } -} - -template -void launch_attn_softmax_backward_v2(T* out_grad, - const T* soft_inp, - int batch_size, - int heads, - int seq_length, - cudaStream_t stream) -{ - const int warps_per_block = 4; - dim3 grid_dim(batch_size * heads * seq_length / 
warps_per_block); - dim3 block_dim(WARP_SIZE, warps_per_block); - - if (seq_length <= 32) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 64) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 128) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 256) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 384) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 512) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 768) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 1024) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else if (seq_length <= 2048) - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - else - throw std::runtime_error( - std::string("Special sequence length found in softmax backward, seq_length: ") + - std::to_string(seq_length)); -} - -template void launch_attn_softmax_backward_v2<__half>(__half* out_grad, - const __half* soft_inp, - int batch_size, - int heads, - int seq_length, - cudaStream_t stream); -template void launch_attn_softmax_backward_v2(float* out_grad, - const float* soft_inp, - int batch_size, - int heads, - int seq_length, - cudaStream_t stream); diff --git a/deepspeed/ops/csrc/transformer_bak/softmax_kernels.hip b/deepspeed/ops/csrc/transformer_bak/softmax_kernels.hip deleted file mode 100644 index afe65b0c9cbdc6b10027db2ddd5c7e8f447e0c24..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/transformer_bak/softmax_kernels.hip +++ /dev/null @@ -1,597 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include "hip/hip_runtime.h" -#include -#include "custom_hip_layers.h" -#include "general_kernels_hip.h" - -namespace cg = cooperative_groups; - -dim3 get_attn_softmax_grid(int batch_size, int heads, int sequence_length, int threads) -{ - int seq_length4 = sequence_length / 4; - int block_compute_size = - (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1); - // Note that the Y and Z dimensions are limited to 65535, while X is basically unlimited: - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications - // The batch size is typically relatively small, while the sequence length could potentially be - // arbitrarily large. We therefore place the batch size second to avoid hitting the Y limit. - unsigned x = heads * sequence_length / block_compute_size; - unsigned y = batch_size; - return {x, y}; -} - -// Fused attention + softmax -template -__global__ void attn_softmax(float* vals, - const float* attn_mask, - int heads, - int seq_length, - int iterations) -{ - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int batch = blockIdx.y; - int row = blockIdx.x; - int max_threads_in_sequence = ::max(seq_length, tbSeq); - int seq_lane = threadIdx.x % max_threads_in_sequence; - - int data_offset = batch * (gridDim.x * block_width) + row * block_width + - (threadIdx.x / max_threads_in_sequence) * seq_length; - int mask_offset = batch * seq_length; - - int wid = threadIdx.x >> WARP_SIZE_BITS; - int lane = threadIdx.x & 0x1f; - - float4* val_cast = reinterpret_cast(vals); - const float4* attn_mask_cast = reinterpret_cast(attn_mask); - - float4 data[MAX_THREAD_ITERATIONS]; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int 
data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float4 mask = attn_mask_cast[mask_offset + data_id]; - data[i] = val_cast[data_offset + data_id]; - - data[i].x += mask.x; - data[i].y += mask.y; - data[i].z += mask.z; - data[i].w += mask.w; - - max_val = (data[i].x > max_val ? data[i].x : max_val); - max_val = (data[i].y > max_val ? data[i].y : max_val); - max_val = (data[i].z > max_val ? data[i].z : max_val); - max_val = (data[i].w > max_val ? data[i].w : max_val); - } else { - data[i].x = minus_infinity; - data[i].y = minus_infinity; - data[i].z = minus_infinity; - data[i].w = minus_infinity; - } - } - - for (int i = 1; i < tbSize; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / tbSize); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - data[i].x = __expf(data[i].x - max_val); - data[i].y = __expf(data[i].y - max_val); - data[i].z = __expf(data[i].z - max_val); - data[i].w = __expf(data[i].w - max_val); - - sum += (data[i].x + data[i].y + data[i].z + data[i].w); - } - - for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / tbSize); - } - - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - data[i].x /= sum; - data[i].y /= sum; - data[i].z /= sum; - data[i].w /= sum; - - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) val_cast[data_offset + data_id] = data[i]; - } -} - -template -__global__ void attn_softmax(__half* vals, - const __half* attn_mask, - int heads, - int seq_length, - int iterations) -{ -#ifdef HALF_PRECISION_AVAILABLE - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int batch = blockIdx.y; - int row = blockIdx.x; - int max_threads_in_sequence = ::max(seq_length, tbSeq); - int seq_lane = threadIdx.x % max_threads_in_sequence; - - int data_offset = batch * (gridDim.x * block_width) + row * block_width + - (threadIdx.x / max_threads_in_sequence) * seq_length; - int mask_offset = batch * seq_length; - - int wid = threadIdx.x >> WARP_SIZE_BITS; - int lane = 
threadIdx.x & 0x1f; - - float2* val_cast = reinterpret_cast(vals); - const float2* attn_mask_cast = reinterpret_cast(attn_mask); - - val_cast += data_offset; - attn_mask_cast += mask_offset; - - float2 low_data[MAX_THREAD_ITERATIONS]; - float2 high_data[MAX_THREAD_ITERATIONS]; - - float max_val = minus_infinity; - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float2 data = val_cast[data_id]; - float2 mask = attn_mask_cast[data_id]; - - __half2* data_arr = reinterpret_cast<__half2*>(&data); - __half2* mask_arr = reinterpret_cast<__half2*>(&mask); - - low_data[i] = __half22float2(data_arr[0]); - high_data[i] = __half22float2(data_arr[1]); - float2 low_mask = __half22float2(mask_arr[0]); - float2 high_mask = __half22float2(mask_arr[1]); - - low_data[i].x += low_mask.x; - low_data[i].y += low_mask.y; - high_data[i].x += high_mask.x; - high_data[i].y += high_mask.y; - - max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); - max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); - max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); - max_val = (high_data[i].y > max_val ? high_data[i].y : max_val); - } - } - - for (int i = 1; i < tbSize; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? temp : max_val); - } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = max_val; - b.sync(); - - if (lane < warp_num) max_val = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { - auto temp = g.shfl_xor(max_val, i); - max_val = (temp > max_val ? 
temp : max_val); - } - - max_val = g.shfl(max_val, threadIdx.x / tbSize); - } - - float sum = 0; - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - low_data[i].x = __expf(low_data[i].x - max_val); - low_data[i].y = __expf(low_data[i].y - max_val); - high_data[i].x = __expf(high_data[i].x - max_val); - high_data[i].y = __expf(high_data[i].y - max_val); - - sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y); - } - } - - for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); } - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = sum; - b.sync(); - - if (lane < warp_num) sum = partialSum[lane]; - -#ifndef __STOCHASTIC_MODE__ - b.sync(); -#endif - - int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); - - for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } - - sum = g.shfl(sum, threadIdx.x / tbSize); - } - - sum += 1e-6; - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + seq_lane; - if (data_id < seq_length) { - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - - low_data[i].x /= sum; - low_data[i].y /= sum; - high_data[i].x /= sum; - high_data[i].y /= sum; - - result_h[0] = __float22half2_rn(low_data[i]); - result_h[1] = __float22half2_rn(high_data[i]); - - val_cast[data_id] = result_f; - } - } - -#endif -} - -template -void launch_attn_softmax(T*, const T*, int, int, int, hipStream_t); - -template <> -void launch_attn_softmax(float* vals, - const float* attn_mask, - int batch_size, - int heads, - int sequence_length, - hipStream_t stream) -{ - const int threads = 128; - int seq_length4 = sequence_length / 4; - - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? 
((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - int iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - - if (sequence_length <= 8) - hipLaunchKernelGGL(( attn_softmax<2, (threads / 2), 2>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 16) - hipLaunchKernelGGL(( attn_softmax<4, (threads / 4), 4>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 32) - hipLaunchKernelGGL(( attn_softmax<8, (threads / 8), 8>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 64) - hipLaunchKernelGGL(( attn_softmax<16, (threads / 16), 16>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 128) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 32), 32>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 256) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 64), 64>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else { - const int threads = 256; - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - iterations = - (sequence_length < subblock_max_workload ? 
(seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - if (sequence_length <= 512) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 128), 128>), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4)) - hipLaunchKernelGGL(( attn_softmax<32, 1, 128>), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, attn_mask, heads, seq_length4, iterations); - else - throw std::runtime_error( - "Unsupport Seq_Length! Check the restriction of the max_threads and " - "max_thread_iterations!"); - } -} - -template <> -void launch_attn_softmax<__half>(__half* vals, - const __half* attn_mask, - int batch_size, - int heads, - int sequence_length, - hipStream_t stream) -{ - const int threads = 128; - int seq_length4 = sequence_length / 4; - - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - - int iterations = - (sequence_length < subblock_max_workload ? 
(seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - - if (sequence_length <= 8) - hipLaunchKernelGGL(( attn_softmax<2, (threads / 2), 2>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 16) - hipLaunchKernelGGL(( attn_softmax<4, (threads / 4), 4>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 32) - hipLaunchKernelGGL(( attn_softmax<8, (threads / 8), 8>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 64) - hipLaunchKernelGGL(( attn_softmax<16, (threads / 16), 16>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 128) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 32), 32>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length <= 256) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 64), 64>) - , dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations); - else { - const int threads = 256; - dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads); - - int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; - - dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / - subblock_max_workload * threads) - : threads); - iterations = - (sequence_length < subblock_max_workload ? 
(seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); - if (sequence_length <= 512) - hipLaunchKernelGGL(( attn_softmax<32, (threads / 128), 128>), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, attn_mask, heads, seq_length4, iterations); - else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4)) - hipLaunchKernelGGL(( attn_softmax<32, 1, 128>), dim3(grid_dim), dim3(block_dim), 0, stream, - vals, attn_mask, heads, seq_length4, iterations); - else - throw std::runtime_error( - "Unsupport Seq_Length! Check the restriction of the max_threads and " - "max_thread_iterations!"); - } -} - -template -__global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_length) -{ - __shared__ float partialSum[MAX_WARP_NUM]; - - int warp_num = blockDim.x >> WARP_SIZE_BITS; // warp-count = num_threads / WARP_SIZE (32) - - int iteration_stride = blockDim.x; - int block_width = blockStride * seq_length; - - int iterations = (seq_length < (MAX_THREAD_ITERATIONS * iteration_stride) - ? (seq_length + iteration_stride - 1) / iteration_stride - : MAX_THREAD_ITERATIONS); - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int row = blockIdx.x; - int id = threadIdx.x; - - int wid = id >> WARP_SIZE_BITS; - int lane = id & 0x1f; - - T val_reg[MAX_THREAD_ITERATIONS]; - T soft_reg[MAX_THREAD_ITERATIONS]; - float grad_reg = 0.0f; - -#pragma unroll - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + id; - if (data_id < block_width) { - val_reg[i] = out_grad[row * block_width + data_id]; - soft_reg[i] = soft_inp[row * block_width + data_id]; - - grad_reg += ((float)val_reg[i] * - (float)soft_reg[i]); // if done in half, the multiplication, we may lose - // 2% of accuracy in computation!! 
- } - } - for (int i = 1; i < tbSize; i *= 2) grad_reg += g.shfl_xor(grad_reg, i); - - if (seq_length > tbSize) { - if (lane == 0) partialSum[wid] = grad_reg; - b.sync(); - - if (lane < warp_num) grad_reg = partialSum[lane]; - - int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); - - for (int i = 1; i < iters; i *= 2) grad_reg += g.shfl_xor(grad_reg, i); - - grad_reg = g.shfl(grad_reg, id / tbSize); - } - - for (int i = 0; i < iterations; i++) { - int data_id = i * iteration_stride + id; - if (data_id < block_width) { - float temp = (float)soft_reg[i] * ((float)val_reg[i] - grad_reg); - out_grad[row * block_width + data_id] = (T)temp; - } - } -} - -template -__global__ void softmax_backward_kernel_v2(T* grad /* input & output*/, - const T* output, - int softmax_length) -{ - int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; - int offset = batch_idx * softmax_length + threadIdx.x; - - grad += offset; - output += offset; - - T grad_reg[ITERATIONS]; - T output_reg[ITERATIONS]; - float sum = 0.0; - -#pragma unroll - for (int i = 0; i < ITERATIONS; ++i) { - int curr_idx = threadIdx.x + i * WARP_SIZE; - if (curr_idx < softmax_length) { - grad_reg[i] = grad[i * WARP_SIZE]; - output_reg[i] = output[i * WARP_SIZE]; - sum += (float)grad_reg[i] * (float)output_reg[i]; - } - } - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i); - -#pragma unroll - for (int i = 0; i < ITERATIONS; ++i) { - int curr_idx = threadIdx.x + i * WARP_SIZE; - if (curr_idx < softmax_length) - grad[i * WARP_SIZE] = (float)output_reg[i] * ((float)grad_reg[i] - sum); - } -} - -template -void launch_attn_softmax_backward_v2(T* out_grad, - const T* soft_inp, - int batch_size, - int heads, - int seq_length, - hipStream_t stream) -{ - const int warps_per_block = 4; - dim3 grid_dim(batch_size * heads * seq_length / 
warps_per_block); - dim3 block_dim(WARP_SIZE, warps_per_block); - - if (seq_length <= 32) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 64) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 128) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 256) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 384) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 512) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 768) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 1024) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else if (seq_length <= 2048) - hipLaunchKernelGGL(( softmax_backward_kernel_v2) - , dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length); - else - throw std::runtime_error( - std::string("Special sequence length found in softmax backward, seq_length: ") + - std::to_string(seq_length)); -} - -template void launch_attn_softmax_backward_v2<__half>(__half* out_grad, - const __half* soft_inp, - int batch_size, - int heads, - int seq_length, - hipStream_t stream); -template void launch_attn_softmax_backward_v2(float* out_grad, - const float* soft_inp, - int batch_size, - int heads, - int seq_length, - hipStream_t 
stream); diff --git a/deepspeed/ops/csrc/utils/flatten_unflatten.cpp b/deepspeed/ops/csrc/utils/flatten_unflatten.cpp deleted file mode 100644 index 7d16c5c00a5f285a5320aa09bdf74de4aa5d2611..0000000000000000000000000000000000000000 --- a/deepspeed/ops/csrc/utils/flatten_unflatten.cpp +++ /dev/null @@ -1,25 +0,0 @@ -/* - Copyright 2020 The Microsoft DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 -*/ - -#include -#include -// https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_flatten.h - -at::Tensor flatten(std::vector tensors) -{ - return torch::utils::flatten_dense_tensors(tensors); -} - -std::vector unflatten(at::Tensor flat, std::vector tensors) -{ - return torch::utils::unflatten_dense_tensors(flat, tensors); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("flatten", &flatten, "Flatten dense tensors"); - m.def("unflatten", &unflatten, "Unflatten dense tensors"); -} diff --git a/deepspeed/ops/lamb/__init__.py b/deepspeed/ops/lamb/__init__.py index 9d3448dd68eadde881eda8c68c1a8033d6e5586e..942d76687a601f36091004dca089ded7cd7ee6a0 100644 --- a/deepspeed/ops/lamb/__init__.py +++ b/deepspeed/ops/lamb/__init__.py @@ -1 +1,3 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .fused_lamb import FusedLamb diff --git a/deepspeed/ops/lamb/fused_lamb.py b/deepspeed/ops/lamb/fused_lamb.py index e9210cdda9bcfecbb2ed357f0f2938fa2ecb511e..33a1461e9883345e6afd17df7dddc6eed56c2be3 100644 --- a/deepspeed/ops/lamb/fused_lamb.py +++ b/deepspeed/ops/lamb/fused_lamb.py @@ -6,7 +6,7 @@ This file is adapted from NVIDIA/apex/optimizer/fused_adam and implements the LA ''' import types import torch -from ..op_builder import FusedLambBuilder +from deepspeed.ops.op_builder import FusedLambBuilder class FusedLamb(torch.optim.Optimizer): diff --git a/deepspeed/ops/op_builder b/deepspeed/ops/op_builder new file mode 120000 index 
0000000000000000000000000000000000000000..db4f9c3350652dff87a26f6bb95c024160842a99 --- /dev/null +++ b/deepspeed/ops/op_builder @@ -0,0 +1 @@ +../../op_builder \ No newline at end of file diff --git a/deepspeed/ops/op_builder/__init__.py b/deepspeed/ops/op_builder/__init__.py deleted file mode 100644 index dcac71011aa8be21d81f2b55f9692fc99d6b1211..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -from .cpu_adam import CPUAdamBuilder -from .cpu_adagrad import CPUAdagradBuilder -from .fused_adam import FusedAdamBuilder -from .fused_lamb import FusedLambBuilder -from .sparse_attn import SparseAttnBuilder -from .transformer import TransformerBuilder -from .stochastic_transformer import StochasticTransformerBuilder -from .utils import UtilsBuilder -from .async_io import AsyncIOBuilder -from .transformer_inference import InferenceBuilder -from .quantizer import QuantizerBuilder -from .builder import get_default_compute_capabilities, OpBuilder - -# TODO: infer this list instead of hard coded -# List of all available ops -__op_builders__ = [ - CPUAdamBuilder(), - CPUAdagradBuilder(), - FusedAdamBuilder(), - FusedLambBuilder(), - SparseAttnBuilder(), - TransformerBuilder(), - StochasticTransformerBuilder(), - AsyncIOBuilder(), - UtilsBuilder(), - QuantizerBuilder(), - InferenceBuilder() -] -ALL_OPS = {op.name: op for op in __op_builders__} diff --git a/deepspeed/ops/op_builder/async_io.py b/deepspeed/ops/op_builder/async_io.py deleted file mode 100644 index aec7911ce96bff070f3b2cc65f5fa58170a2cb86..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/async_io.py +++ /dev/null @@ -1,106 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -import distutils.spawn -import subprocess - -from .builder import OpBuilder - - -class AsyncIOBuilder(OpBuilder): - BUILD_VAR = "DS_BUILD_AIO" - NAME = "async_io" - - def __init__(self): - 
super().__init__(name=self.NAME) - - def absolute_name(self): - return f'deepspeed.ops.aio.{self.NAME}_op' - - def sources(self): - return [ - 'csrc/aio/py_lib/deepspeed_py_copy.cpp', - 'csrc/aio/py_lib/py_ds_aio.cpp', - 'csrc/aio/py_lib/deepspeed_py_aio.cpp', - 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', - 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', - 'csrc/aio/common/deepspeed_aio_utils.cpp', - 'csrc/aio/common/deepspeed_aio_common.cpp', - 'csrc/aio/common/deepspeed_aio_types.cpp' - ] - - def include_paths(self): - return ['csrc/aio/py_lib', 'csrc/aio/common'] - - def cxx_args(self): - # -O0 for improved debugging, since performance is bound by I/O - CPU_ARCH = self.cpu_arch() - SIMD_WIDTH = self.simd_width() - return [ - '-g', - '-Wall', - '-O0', - '-std=c++14', - '-shared', - '-fPIC', - '-Wno-reorder', - CPU_ARCH, - '-fopenmp', - SIMD_WIDTH, - '-laio', - ] - - def extra_ldflags(self): - return ['-laio'] - - def check_for_libaio_pkg(self): - libs = dict( - dpkg=["-l", - "libaio-dev", - "apt"], - pacman=["-Q", - "libaio", - "pacman"], - rpm=["-q", - "libaio-devel", - "yum"], - ) - - found = False - for pkgmgr, data in libs.items(): - flag, lib, tool = data - path = distutils.spawn.find_executable(pkgmgr) - if path is not None: - cmd = f"{pkgmgr} {flag} {lib}" - result = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - if result.wait() == 0: - found = True - else: - self.warning( - f"{self.NAME}: please install the {lib} package with {tool}") - break - return found - - def is_compatible(self, verbose=True): - # Check for the existence of libaio by using distutils - # to compile and link a test program that calls io_submit, - # which is a function provided by libaio that is used in the async_io op. - # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS - # respectively to specify the directories for libaio.h and libaio.so. 
- aio_compatible = self.has_function('io_submit', ('aio', )) - if verbose and not aio_compatible: - self.warning( - f"{self.NAME} requires the dev libaio .so object and headers but these were not found." - ) - - # Check for the libaio package via known package managers - # to print suggestions on which package to install. - self.check_for_libaio_pkg() - - self.warning( - "If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found." - ) - return super().is_compatible(verbose) and aio_compatible diff --git a/deepspeed/ops/op_builder/builder.py b/deepspeed/ops/op_builder/builder.py deleted file mode 100644 index 8c5396fb3fb74ca782aa62caeaa8e23027017db5..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/builder.py +++ /dev/null @@ -1,699 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -import os -import sys -import time -import json -import importlib -from pathlib import Path -import subprocess -import shlex -import shutil -import tempfile -import distutils.ccompiler -import distutils.log -import distutils.sysconfig -from distutils.errors import CompileError, LinkError -from abc import ABC, abstractmethod - -YELLOW = '\033[93m' -END = '\033[0m' -WARNING = f"{YELLOW} [WARNING] {END}" - -DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions" -DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" - -try: - import torch -except ImportError: - print( - f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops." 
- ) -else: - TORCH_MAJOR = int(torch.__version__.split('.')[0]) - TORCH_MINOR = int(torch.__version__.split('.')[1]) - - -def installed_cuda_version(): - import torch.utils.cpp_extension - cuda_home = torch.utils.cpp_extension.CUDA_HOME - assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)" - # Ensure there is not a cuda version mismatch between torch and nvcc compiler - output = subprocess.check_output([cuda_home + "/bin/nvcc", - "-V"], - universal_newlines=True) - output_split = output.split() - release_idx = output_split.index("release") - release = output_split[release_idx + 1].replace(',', '').split(".") - # Ignore patch versions, only look at major + minor - cuda_major, cuda_minor = release[:2] - installed_cuda_version = ".".join(release[:2]) - return int(cuda_major), int(cuda_minor) - - -def get_default_compute_capabilities(): - compute_caps = DEFAULT_COMPUTE_CAPABILITIES - import torch.utils.cpp_extension - if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version( - )[0] >= 11: - if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0: - # Special treatment of CUDA 11.0 because compute_86 is not supported. 
- compute_caps += ";8.0" - else: - compute_caps += ";8.0;8.6" - return compute_caps - - -# list compatible minor CUDA versions - so that for example pytorch built with cuda-11.0 can be used -# to build deepspeed and system-wide installed cuda 11.2 -cuda_minor_mismatch_ok = { - 10: [ - "10.0", - "10.1", - "10.2", - ], - 11: [ - "11.0", - "11.1", - "11.2", - "11.3", - "11.4", - "11.5", - "11.6", - ], -} - - -def assert_no_cuda_mismatch(): - cuda_major, cuda_minor = installed_cuda_version() - sys_cuda_version = f'{cuda_major}.{cuda_minor}' - torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) - # This is a show-stopping error, should probably not proceed past this - if sys_cuda_version != torch_cuda_version: - if (cuda_major in cuda_minor_mismatch_ok - and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major] - and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]): - print(f"Installed CUDA version {sys_cuda_version} does not match the " - f"version torch was compiled with {torch.version.cuda} " - "but since the APIs are compatible, accepting this combination") - return - raise Exception( - f"Installed CUDA version {sys_cuda_version} does not match the " - f"version torch was compiled with {torch.version.cuda}, unable to compile " - "cuda/cpp extensions without a matching cuda version.") - - -class OpBuilder(ABC): - _rocm_version = None - _is_rocm_pytorch = None - - def __init__(self, name): - self.name = name - self.jit_mode = False - - @abstractmethod - def absolute_name(self): - ''' - Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam - will be installed as something like: deepspeed/ops/adam/cpu_adam.so - ''' - pass - - @abstractmethod - def sources(self): - ''' - Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed) - ''' - pass - - def hipify_extension(self): - pass - - @staticmethod - def assert_torch_info(torch_info): - 
install_torch_version = torch_info['version'] - install_cuda_version = torch_info['cuda_version'] - install_hip_version = torch_info['hip_version'] - - if not OpBuilder.is_rocm_pytorch(): - current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) - else: - current_hip_version = ".".join(torch.version.hip.split('.')[:2]) - - current_torch_version = ".".join(torch.__version__.split('.')[:2]) - - if not OpBuilder.is_rocm_pytorch(): - if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version: - raise RuntimeError( - "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. Please re-install " - f"DeepSpeed or switch torch versions. DeepSpeed install versions: " - f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:" - f"torch={current_torch_version}, cuda={current_cuda_version}") - else: - if install_hip_version != current_hip_version or install_torch_version != current_torch_version: - raise RuntimeError( - "PyTorch and HIP version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. Please re-install " - f"DeepSpeed or switch torch versions. 
DeepSpeed install versions: " - f"torch={install_torch_version}, hip={install_hip_version}, runtime versions:" - f"torch={current_torch_version}, hip={current_hip_version}") - - @staticmethod - def is_rocm_pytorch(): - if OpBuilder._is_rocm_pytorch is not None: - return OpBuilder._is_rocm_pytorch - - _is_rocm_pytorch = False - try: - import torch - except ImportError: - pass - else: - if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5): - _is_rocm_pytorch = hasattr(torch.version, - 'hip') and torch.version.hip is not None - if _is_rocm_pytorch: - from torch.utils.cpp_extension import ROCM_HOME - _is_rocm_pytorch = ROCM_HOME is not None - OpBuilder._is_rocm_pytorch = _is_rocm_pytorch - return OpBuilder._is_rocm_pytorch - - @staticmethod - def installed_rocm_version(): - if OpBuilder._rocm_version: - return OpBuilder._rocm_version - - ROCM_MAJOR = '0' - ROCM_MINOR = '0' - if OpBuilder.is_rocm_pytorch(): - from torch.utils.cpp_extension import ROCM_HOME - #with open('/opt/rocm/.info/version-dev', 'r') as file: - with open('/opt/dtk-22.04/.info/version-dev', 'r') as file: - ROCM_VERSION_DEV_RAW = file.read() - ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0] - ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1] - OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR)) - return OpBuilder._rocm_version - - def include_paths(self): - ''' - Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed) - ''' - return [] - - def nvcc_args(self): - ''' - Returns optional list of compiler flags to forward to nvcc when building CUDA sources - ''' - return [] - - def cxx_args(self): - ''' - Returns optional list of compiler flags to forward to the build - ''' - return [] - - def is_compatible(self, verbose=True): - ''' - Check if all non-python dependencies are satisfied to build this op - ''' - return True - - def extra_ldflags(self): - return [] - - def libraries_installed(self, libraries): - valid = False - check_cmd = 'dpkg -l' - 
for lib in libraries: - result = subprocess.Popen(f'dpkg -l {lib}', - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - valid = valid or result.wait() == 0 - return valid - - def has_function(self, funcname, libraries, verbose=False): - ''' - Test for existence of a function within a tuple of libraries. - - This is used as a smoke test to check whether a certain library is available. - As a test, this creates a simple C program that calls the specified function, - and then distutils is used to compile that program and link it with the specified libraries. - Returns True if both the compile and link are successful, False otherwise. - ''' - tempdir = None # we create a temporary directory to hold various files - filestderr = None # handle to open file to which we redirect stderr - oldstderr = None # file descriptor for stderr - try: - # Echo compile and link commands that are used. - if verbose: - distutils.log.set_verbosity(1) - - # Create a compiler object. - compiler = distutils.ccompiler.new_compiler(verbose=verbose) - - # Configure compiler and linker to build according to Python install. - distutils.sysconfig.customize_compiler(compiler) - - # Create a temporary directory to hold test files. - tempdir = tempfile.mkdtemp() - - # Define a simple C program that calls the function in question - prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % ( - funcname, - funcname) - - # Write the test program to a file. - filename = os.path.join(tempdir, 'test.c') - with open(filename, 'w') as f: - f.write(prog) - - # Redirect stderr file descriptor to a file to silence compile/link warnings. 
- if not verbose: - filestderr = open(os.path.join(tempdir, 'stderr.txt'), 'w') - oldstderr = os.dup(sys.stderr.fileno()) - os.dup2(filestderr.fileno(), sys.stderr.fileno()) - - # Workaround for behavior in distutils.ccompiler.CCompiler.object_filenames() - # Otherwise, a local directory will be used instead of tempdir - drive, driveless_filename = os.path.splitdrive(filename) - root_dir = driveless_filename[0] if os.path.isabs(driveless_filename) else '' - output_dir = os.path.join(drive, root_dir) - - # Attempt to compile the C program into an object file. - cflags = shlex.split(os.environ.get('CFLAGS', "")) - objs = compiler.compile([filename], - output_dir=output_dir, - extra_preargs=self.strip_empty_entries(cflags)) - - # Attempt to link the object file into an executable. - # Be sure to tack on any libraries that have been specified. - ldflags = shlex.split(os.environ.get('LDFLAGS', "")) - compiler.link_executable(objs, - os.path.join(tempdir, - 'a.out'), - extra_preargs=self.strip_empty_entries(ldflags), - libraries=libraries) - - # Compile and link succeeded - return True - - except CompileError: - return False - - except LinkError: - return False - - except: - return False - - finally: - # Restore stderr file descriptor and close the stderr redirect file. - if oldstderr is not None: - os.dup2(oldstderr, sys.stderr.fileno()) - if filestderr is not None: - filestderr.close() - - # Delete the temporary directory holding the test program and stderr files. 
- if tempdir is not None: - shutil.rmtree(tempdir) - - def strip_empty_entries(self, args): - ''' - Drop any empty strings from the list of compile and link flags - ''' - return [x for x in args if len(x) > 0] - - def cpu_arch(self): - try: - from cpuinfo import get_cpu_info - except ImportError as e: - cpu_info = self._backup_cpuinfo() - if cpu_info is None: - return "-march=native" - - try: - cpu_info = get_cpu_info() - except Exception as e: - self.warning( - f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " - "falling back to `lscpu` to get this information.") - cpu_info = self._backup_cpuinfo() - if cpu_info is None: - return "-march=native" - - if cpu_info['arch'].startswith('PPC_'): - # gcc does not provide -march on PowerPC, use -mcpu instead - return '-mcpu=native' - return '-march=native' - - def _backup_cpuinfo(self): - # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides - if not self.command_exists('lscpu'): - self.warning( - f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo " - "to detect the CPU architecture. 'lscpu' does not appear to exist on " - "your system, will fall back to use -march=native and non-vectorized execution." 
- ) - return None - result = subprocess.check_output('lscpu', shell=True) - result = result.decode('utf-8').strip().lower() - - cpu_info = {} - cpu_info['arch'] = None - cpu_info['flags'] = "" - if 'genuineintel' in result or 'authenticamd' in result: - cpu_info['arch'] = 'X86_64' - if 'avx512' in result: - cpu_info['flags'] += 'avx512,' - if 'avx2' in result: - cpu_info['flags'] += 'avx2' - elif 'ppc64le' in result: - cpu_info['arch'] = "PPC_" - - return cpu_info - - def simd_width(self): - try: - from cpuinfo import get_cpu_info - except ImportError as e: - cpu_info = self._backup_cpuinfo() - if cpu_info is None: - return '-D__SCALAR__' - - try: - cpu_info = get_cpu_info() - except Exception as e: - self.warning( - f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " - "falling back to `lscpu` to get this information.") - cpu_info = self._backup_cpuinfo() - if cpu_info is None: - return '-D__SCALAR__' - - if cpu_info['arch'] == 'X86_64': - if 'avx512' in cpu_info['flags']: - return '-D__AVX512__' - elif 'avx2' in cpu_info['flags']: - return '-D__AVX256__' - return '-D__SCALAR__' - - def python_requirements(self): - ''' - Override if op wants to define special dependencies, otherwise will - take self.name and load requirements-.txt if it exists. - ''' - path = f'requirements/requirements-{self.name}.txt' - requirements = [] - if os.path.isfile(path): - with open(path, 'r') as fd: - requirements = [r.strip() for r in fd.readlines()] - return requirements - - def command_exists(self, cmd): - if '|' in cmd: - cmds = cmd.split("|") - else: - cmds = [cmd] - valid = False - for cmd in cmds: - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) - valid = valid or result.wait() == 0 - - if not valid and len(cmds) > 1: - print( - f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!" 
- ) - elif not valid and len(cmds) == 1: - print( - f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!" - ) - return valid - - def warning(self, msg): - print(f"{WARNING} {msg}") - - def deepspeed_src_path(self, code_path): - if os.path.isabs(code_path): - return code_path - else: - return os.path.join(Path(__file__).parent.parent.absolute(), code_path) - - def builder(self): - from torch.utils.cpp_extension import CppExtension - return CppExtension( - name=self.absolute_name(), - sources=self.strip_empty_entries(self.sources()), - include_dirs=self.strip_empty_entries(self.include_paths()), - extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, - extra_link_args=self.strip_empty_entries(self.extra_ldflags())) - - def load(self, verbose=True): - from ...git_version_info import installed_ops, torch_info - if installed_ops[self.name]: - # Ensure the op we're about to load was compiled with the same - # torch/cuda versions we are currently using at runtime. - if isinstance(self, CUDAOpBuilder): - self.assert_torch_info(torch_info) - - return importlib.import_module(self.absolute_name()) - else: - return self.jit_load(verbose) - - def jit_load(self, verbose=True): - if not self.is_compatible(verbose): - raise RuntimeError( - f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue." - ) - try: - import ninja - except ImportError: - raise RuntimeError( - f"Unable to JIT load the {self.name} op due to ninja not being installed." 
- ) - - if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): - assert_no_cuda_mismatch() - - self.jit_mode = True - from torch.utils.cpp_extension import load - - # Ensure directory exists to prevent race condition in some cases - ext_path = os.path.join( - os.environ.get('TORCH_EXTENSIONS_DIR', - DEFAULT_TORCH_EXTENSION_PATH), - self.name) - os.makedirs(ext_path, exist_ok=True) - - start_build = time.time() - sources = [self.deepspeed_src_path(path) for path in self.sources()] - extra_include_paths = [ - self.deepspeed_src_path(path) for path in self.include_paths() - ] - - # Torch will try and apply whatever CCs are in the arch list at compile time, - # we have already set the intended targets ourselves we know that will be - # needed at runtime. This prevents CC collisions such as multiple __half - # implementations. Stash arch list to reset after build. - torch_arch_list = None - if "TORCH_CUDA_ARCH_LIST" in os.environ: - torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST") - os.environ["TORCH_CUDA_ARCH_LIST"] = "" - - op_module = load( - name=self.name, - sources=self.strip_empty_entries(sources), - extra_include_paths=self.strip_empty_entries(extra_include_paths), - extra_cflags=self.strip_empty_entries(self.cxx_args()), - extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), - extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), - verbose=verbose) - build_duration = time.time() - start_build - if verbose: - print(f"Time to load {self.name} op: {build_duration} seconds") - - # Reset arch list so we are not silently removing it for other possible use cases - if torch_arch_list: - os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list - - return op_module - - -class CUDAOpBuilder(OpBuilder): - def compute_capability_args(self, cross_compile_archs=None): - """ - Returns nvcc compute capability compile flags. - - 1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`. - 2. 
If neither is set default compute capabilities will be used - 3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX - - Format: - - - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples: - - TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ... - TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ... - - - `cross_compile_archs` uses ; separator. - - """ - ccs = [] - if self.jit_mode: - # Compile for underlying architectures since we know those at runtime - for i in range(torch.cuda.device_count()): - CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i) - cc = f"{CC_MAJOR}.{CC_MINOR}" - if cc not in ccs: - ccs.append(cc) - ccs = sorted(ccs) - ccs[-1] += '+PTX' - else: - # Cross-compile mode, compile for various architectures - # env override takes priority - cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None) - if cross_compile_archs_env is not None: - if cross_compile_archs is not None: - print( - f"{WARNING} env var `TORCH_CUDA_ARCH_LIST={cross_compile_archs_env}` overrides `cross_compile_archs={cross_compile_archs}`" - ) - cross_compile_archs = cross_compile_archs_env.replace(' ', ';') - else: - if cross_compile_archs is None: - cross_compile_archs = get_default_compute_capabilities() - ccs = cross_compile_archs.split(';') - - args = [] - for cc in ccs: - num = cc[0] + cc[2] - args.append(f'-gencode=arch=compute_{num},code=sm_{num}') - if cc.endswith('+PTX'): - args.append(f'-gencode=arch=compute_{num},code=compute_{num}') - - return args - - def version_dependent_macros(self): - # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 - version_ge_1_1 = [] - if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): - version_ge_1_1 = ['-DVERSION_GE_1_1'] - version_ge_1_3 = [] - if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): - version_ge_1_3 = ['-DVERSION_GE_1_3'] - version_ge_1_5 = [] - if (TORCH_MAJOR > 1) 
or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): - version_ge_1_5 = ['-DVERSION_GE_1_5'] - return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 - - def is_compatible(self, verbose=True): - return super().is_compatible(verbose) - - def builder(self): - from torch.utils.cpp_extension import CUDAExtension - if not self.is_rocm_pytorch(): - assert_no_cuda_mismatch() - cuda_ext = CUDAExtension( - name=self.absolute_name(), - sources=self.strip_empty_entries(self.sources()), - include_dirs=self.strip_empty_entries(self.include_paths()), - libraries=self.strip_empty_entries(self.libraries_args()), - extra_compile_args={ - 'cxx': self.strip_empty_entries(self.cxx_args()), - 'nvcc': self.strip_empty_entries(self.nvcc_args()) - }) - if self.is_rocm_pytorch(): - # hip converts paths to absolute, this converts back to relative - sources = cuda_ext.sources - curr_file = Path(__file__).parent.parent # ds root - for i in range(len(sources)): - src = Path(sources[i]) - sources[i] = str(src.relative_to(curr_file)) - cuda_ext.sources = sources - return cuda_ext - - def hipify_extension(self): - if self.is_rocm_pytorch(): - from torch.utils.hipify import hipify_python - hipify_python.hipify( - project_directory=os.getcwd(), - output_directory=os.getcwd(), - header_include_dirs=self.include_paths(), - includes=[os.path.join(os.getcwd(), - '*')], - extra_files=[os.path.abspath(s) for s in self.sources()], - show_detailed=True, - is_pytorch_extension=True, - hipify_extra_files_only=True, - ) - - def cxx_args(self): - if sys.platform == "win32": - return ['-O2'] - else: - return ['-O3', '-std=c++14', '-g', '-Wno-reorder'] - - def nvcc_args(self): - args = ['-O3'] - if self.is_rocm_pytorch(): - ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() - args += [ - '-std=c++14', - '-U__HIP_NO_HALF_OPERATORS__', - '-U__HIP_NO_HALF_CONVERSIONS__', - '-U__HIP_NO_HALF2_OPERATORS__', - '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, - '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR - ] - else: - cuda_major, _ = 
installed_cuda_version() - args += [ - '--use_fast_math', - '-std=c++17' - if sys.platform == "win32" and cuda_major > 10 else '-std=c++14', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '-U__CUDA_NO_HALF2_OPERATORS__' - ] - args += self.compute_capability_args() - return args - - def libraries_args(self): - if sys.platform == "win32": - return ['cublas', 'curand'] - else: - return [] - - -class TorchCPUOpBuilder(CUDAOpBuilder): - def extra_ldflags(self): - if not self.is_rocm_pytorch(): - return ['-lcurand'] - else: - return [] - - def cxx_args(self): - import torch - if not self.is_rocm_pytorch(): - CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") - else: - CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib") - CPU_ARCH = self.cpu_arch() - SIMD_WIDTH = self.simd_width() - - args = super().cxx_args() - ###aiss debug0506########### - args += [ - f'-L{CUDA_LIB64}', - #'-lcudart', - #'-lcublas', - '-lrocblas', - '-lhipblas', - '-g', - CPU_ARCH, - '-fopenmp', - SIMD_WIDTH, - ] - return args diff --git a/deepspeed/ops/op_builder/cpu_adagrad.py b/deepspeed/ops/op_builder/cpu_adagrad.py deleted file mode 100644 index 24f0ff3ff52505c11c61a188f34a2b15e68fc5bc..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/cpu_adagrad.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -import os -from .builder import TorchCPUOpBuilder - - -class CPUAdagradBuilder(TorchCPUOpBuilder): - BUILD_VAR = "DS_BUILD_CPU_ADAGRAD" - NAME = "cpu_adagrad" - - def __init__(self): - super().__init__(name=self.NAME) - - def absolute_name(self): - return f'deepspeed.ops.adagrad.{self.NAME}_op' - - def sources(self): - return ['csrc/adagrad/cpu_adagrad.cpp', 'csrc/common/custom_cuda_kernel.cu'] - - def include_paths(self): - import torch - if not self.is_rocm_pytorch(): - CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] - else: - CUDA_INCLUDE = [ - 
os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "rocrand"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "hiprand"), - ] - return ['csrc/includes'] + CUDA_INCLUDE - - ##aiss add - def cxx_args(self): - - return [ - '-lrocblas', - '-lhipblas', - '-D__HIP_DEVICE_COMPILE__',] diff --git a/deepspeed/ops/op_builder/cpu_adam.py b/deepspeed/ops/op_builder/cpu_adam.py deleted file mode 100644 index 35a2a8a17d54b3f60a70e2ce8fee32488372a75c..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/cpu_adam.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -import os -from .builder import TorchCPUOpBuilder - - -class CPUAdamBuilder(TorchCPUOpBuilder): - BUILD_VAR = "DS_BUILD_CPU_ADAM" - NAME = "cpu_adam" - - def __init__(self): - super().__init__(name=self.NAME) - - def absolute_name(self): - return f'deepspeed.ops.adam.{self.NAME}_op' - - def sources(self): - return ['csrc/adam/cpu_adam.cpp', 'csrc/common/custom_cuda_kernel.cu'] - - def libraries_args(self): - args = super().libraries_args() - #args += ['curand'] - #aiss debug 0506######## - args += ['hiprand'] - args += ['rocrand'] - return args - - def include_paths(self): - import torch - if not self.is_rocm_pytorch(): - CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] - else: - CUDA_INCLUDE = [ - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "rocrand"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "hiprand"), - ] - return ['csrc/includes'] + CUDA_INCLUDE - - ##aiss add - def cxx_args(self): - - return [ - '-lrocblas', - '-lhipblas', - '-D__HIP_DEVICE_COMPILE__', - ] diff --git a/deepspeed/ops/op_builder/fused_adam.py b/deepspeed/ops/op_builder/fused_adam.py deleted file mode 100644 index 
6ff264fbf1a1089a2d35afc520f98d9d7548f924..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/fused_adam.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -from .builder import CUDAOpBuilder - - -class FusedAdamBuilder(CUDAOpBuilder): - BUILD_VAR = "DS_BUILD_FUSED_ADAM" - NAME = "fused_adam" - - def __init__(self): - super().__init__(name=self.NAME) - - def absolute_name(self): - return f'deepspeed.ops.adam.{self.NAME}_op' - - def sources(self): - return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu'] - - def include_paths(self): - return ['csrc/includes', 'csrc/adam'] - - def cxx_args(self): - args = super().cxx_args() - return args + self.version_dependent_macros() - - def nvcc_args(self): - nvcc_flags = ['-O3'] + self.version_dependent_macros() - if not self.is_rocm_pytorch(): - nvcc_flags.extend(['-lineinfo', - '--use_fast_math'] + self.compute_capability_args()) - return nvcc_flags diff --git a/deepspeed/ops/op_builder/fused_lamb.py b/deepspeed/ops/op_builder/fused_lamb.py deleted file mode 100644 index 106728f6f3fe9e6449deeb0228410e50cfc8648a..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/fused_lamb.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -from .builder import CUDAOpBuilder - - -class FusedLambBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_FUSED_LAMB' - NAME = "fused_lamb" - - def __init__(self): - super().__init__(name=self.NAME) - - def absolute_name(self): - return f'deepspeed.ops.lamb.{self.NAME}_op' - - def sources(self): - return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu'] - - def include_paths(self): - return ['csrc/includes'] - - def cxx_args(self): - args = super().cxx_args() - return args + self.version_dependent_macros() - - def nvcc_args(self): - nvcc_flags = ['-O3'] + self.version_dependent_macros() - if self.is_rocm_pytorch(): - ROCM_MAJOR, ROCM_MINOR = 
self.installed_rocm_version() - nvcc_flags += [ - '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, - '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR - ] - else: - nvcc_flags.extend(['-lineinfo', - '--use_fast_math'] + self.compute_capability_args()) - return nvcc_flags diff --git a/deepspeed/ops/op_builder/quantizer.py b/deepspeed/ops/op_builder/quantizer.py deleted file mode 100644 index 43bc5778ea20fbb658bcde6c6f96d0fc5f840e3b..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/quantizer.py +++ /dev/null @@ -1,22 +0,0 @@ -from .builder import CUDAOpBuilder - - -class QuantizerBuilder(CUDAOpBuilder): - BUILD_VAR = "DS_BUILD_QUANTIZER" - NAME = "quantizer" - - def __init__(self, name=None): - name = self.NAME if name is None else name - super().__init__(name=name) - - def absolute_name(self): - return f'deepspeed.ops.quantizer.{self.NAME}_op' - - def sources(self): - return [ - 'csrc/quantization/pt_binding.cpp', - 'csrc/quantization/quantizer.cu', - ] - - def include_paths(self): - return ['csrc/includes'] diff --git a/deepspeed/ops/op_builder/sparse_attn.py b/deepspeed/ops/op_builder/sparse_attn.py deleted file mode 100644 index 004fdd698200f4c8d47831297a5a3306352e4bef..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/sparse_attn.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -import warnings -from .builder import OpBuilder - -try: - from packaging import version as pkg_version -except ImportError: - pkg_version = None - - -class SparseAttnBuilder(OpBuilder): - BUILD_VAR = "DS_BUILD_SPARSE_ATTN" - NAME = "sparse_attn" - - def __init__(self): - super().__init__(name=self.NAME) - - def absolute_name(self): - return f'deepspeed.ops.sparse_attention.{self.NAME}_op' - - def sources(self): - return ['csrc/sparse_attention/utils.cpp'] - - def cxx_args(self): - return ['-O2', '-fopenmp'] - - def is_compatible(self, verbose=True): - # Check to see if llvm and cmake are installed since they are 
dependencies - #required_commands = ['llvm-config|llvm-config-9', 'cmake'] - #command_status = list(map(self.command_exists, required_commands)) - #deps_compatible = all(command_status) - -#####aiss debug 0506############## - if self.is_rocm_pytorch(): - # self.warning(f'{self.NAME} is not compatible with ROCM') - # return False - return True - try: - import torch - except ImportError: - self.warning(f"unable to import torch, please install it first") - return False - - # torch-cpu will not have a cuda version - if torch.version.cuda is None: - cuda_compatible = False - self.warning(f"{self.NAME} cuda is not available from torch") - else: - major, minor = torch.version.cuda.split('.')[:2] - cuda_compatible = (int(major) == 10 - and int(minor) >= 1) or (int(major) >= 11) - if not cuda_compatible: - self.warning(f"{self.NAME} requires CUDA version 10.1+") - - TORCH_MAJOR = int(torch.__version__.split('.')[0]) - TORCH_MINOR = int(torch.__version__.split('.')[1]) - torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5 - if not torch_compatible: - self.warning( - f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}' - ) - - try: - import triton - except ImportError: - # auto-install of triton is broken on some systems, reverting to manual install for now - # see this issue: https://github.com/microsoft/DeepSpeed/issues/1710 - self.warning( - f"please install triton==1.0.0 if you want to use sparse attention") - return False - - if pkg_version: - installed_triton = pkg_version.parse(triton.__version__) - triton_mismatch = installed_triton != pkg_version.parse("1.0.0") - else: - installed_triton = triton.__version__ - triton_mismatch = installed_triton != "1.0.0" - - if triton_mismatch: - self.warning( - f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible" - ) - return False - - return super().is_compatible(verbose) and torch_compatible and cuda_compatible diff --git 
a/deepspeed/ops/op_builder/stochastic_transformer.py b/deepspeed/ops/op_builder/stochastic_transformer.py deleted file mode 100644 index aa47c13c49e4b64a0a30c089825104ae075bbbbb..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/stochastic_transformer.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -from .transformer import TransformerBuilder - - -class StochasticTransformerBuilder(TransformerBuilder): - BUILD_VAR = "DS_BUILD_STOCHASTIC_TRANSFORMER" - NAME = "stochastic_transformer" - - def __init__(self): - super().__init__(name=self.NAME) - - def absolute_name(self): - return f'deepspeed.ops.transformer.{self.NAME}_op' - - def nvcc_args(self): - args = super().nvcc_args() - args.append('-D__STOCHASTIC_MODE__') - return args diff --git a/deepspeed/ops/op_builder/transformer_inference.py b/deepspeed/ops/op_builder/transformer_inference.py deleted file mode 100644 index 23eab4886e80e4026e738d17411e54a9f68448d7..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/transformer_inference.py +++ /dev/null @@ -1,32 +0,0 @@ -from .builder import CUDAOpBuilder - - -class InferenceBuilder(CUDAOpBuilder): - BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE" - NAME = "transformer_inference" - - def __init__(self, name=None): - name = self.NAME if name is None else name - super().__init__(name=name) - - def absolute_name(self): - return f'deepspeed.ops.transformer.inference.{self.NAME}_op' - - def sources(self): - return [ - 'csrc/transformer/inference/csrc/pt_binding.cpp', - 'csrc/transformer/inference/csrc/gelu.cu', - 'csrc/transformer/inference/csrc/normalize.cu', - 'csrc/transformer/inference/csrc/softmax.cu', - 'csrc/transformer/inference/csrc/dequantize.cu', - 'csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu', - ] - - def extra_ldflags(self): - if not self.is_rocm_pytorch(): - return ['-lcurand'] - else: - return [] - - def include_paths(self): - return 
['csrc/transformer/inference/includes'] diff --git a/deepspeed/ops/op_builder/utils.py b/deepspeed/ops/op_builder/utils.py deleted file mode 100644 index 02d4daa41680aaeaa9bac923b9c79e704b2c2b17..0000000000000000000000000000000000000000 --- a/deepspeed/ops/op_builder/utils.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" -from .builder import OpBuilder - - -class UtilsBuilder(OpBuilder): - BUILD_VAR = "DS_BUILD_UTILS" - NAME = "utils" - - def __init__(self): - super().__init__(name=self.NAME) - - def absolute_name(self): - return f'deepspeed.ops.{self.NAME}_op' - - def sources(self): - return ['csrc/utils/flatten_unflatten.cpp'] diff --git a/deepspeed/ops/quantizer/__init__.py b/deepspeed/ops/quantizer/__init__.py index 0bf4045a4afd77c4c190374e2cb626c8c58982ec..922256532d43e9264dfe2fb0edc61ec7f380dae1 100644 --- a/deepspeed/ops/quantizer/__init__.py +++ b/deepspeed/ops/quantizer/__init__.py @@ -1 +1,3 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .quantizer import ds_quantizer diff --git a/deepspeed/ops/quantizer/quantizer.py b/deepspeed/ops/quantizer/quantizer.py old mode 100644 new mode 100755 index cea9434049d9a5947416be872180597366d3fe3f..31cfa96d60c323aedc760710cd6298f480691998 --- a/deepspeed/ops/quantizer/quantizer.py +++ b/deepspeed/ops/quantizer/quantizer.py @@ -1,14 +1,9 @@ ''' Copyright 2020 The Microsoft DeepSpeed Team ''' -import json -import math -import importlib import torch -from torch import nn -from torch.autograd import Function -from ..op_builder import QuantizerBuilder +from deepspeed.ops.op_builder import QuantizerBuilder # Cuda modules will be imported if needed quantizer_cuda_module = None diff --git a/deepspeed/ops/random_ltd/__init__.py b/deepspeed/ops/random_ltd/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..34b0dd30e294302ba73c2a2dc91e865644e2d056 --- /dev/null +++ b/deepspeed/ops/random_ltd/__init__.py @@ -0,0 +1,3 @@ +'''Copyright The Microsoft 
DeepSpeed Team''' + +from .dropping_utils import gpt_sample_tokens, bert_sample_tokens, GatherTokens, ScatterTokens diff --git a/deepspeed/ops/random_ltd/dropping_utils.py b/deepspeed/ops/random_ltd/dropping_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..102ffe13d9d187a3a1bcbfb7db2d24ebaaabbab3 --- /dev/null +++ b/deepspeed/ops/random_ltd/dropping_utils.py @@ -0,0 +1,145 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" +import torch + +from deepspeed.ops.op_builder import RandomLTDBuilder +""" +Returns: + sampled_indices: [layers, batch_size, reserved_length] + new_mask: [batch_size, 1, reserved_length, reserved_length] +""" + +random_ltd_module = None + + +def gpt_sample_tokens(reserved_length: int, + seq_length: int, + batch_size: int, + layers: int = 1, + device: str = 'cpu', + attn_mask: torch.Tensor = None): + + prob_dist = torch.ones((layers * batch_size, seq_length), device=device) + sampled_indices = torch.multinomial(prob_dist, reserved_length) + + sampled_indices = sampled_indices.reshape(layers, + batch_size, + reserved_length).to(torch.int32) + global random_ltd_module + if random_ltd_module is None: + random_ltd_module = RandomLTDBuilder().load() + sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length) + + # Not certain the optimized kernel is actually better here, cause it kind of screws + # with alignment right if the sequence length is not divisble by like 16 + # new_mask = random_ltd_module.mask_gather_gpt(attn_mask, reserved_length) + if attn_mask is not None: + new_mask = attn_mask[:, :, :reserved_length, :reserved_length] + else: + new_mask = None + + return sampled_indices, new_mask + + +""" +Returns: + sampled_indices: [layers, batch_size, reserved_length] + new_mask: [layers, batch_size, 1, reserved_length, reserved_length] +""" + + +def bert_sample_tokens(reserved_length: int, + seq_length: int, + batch_size: int, + layers: int = 1, + device: str = 'cpu', + attn_mask: torch.Tensor 
= None): + assert attn_mask is not None + prob_dist = torch.ones((layers * batch_size, seq_length), device=device) + sampled_indices = torch.multinomial(prob_dist, reserved_length) + + sampled_indices = sampled_indices.reshape(layers, + batch_size, + reserved_length).to(torch.int32) + global random_ltd_module + if random_ltd_module is None: + random_ltd_module = RandomLTDBuilder().load() + + sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length) + dtype = sampled_indices.dtype + + sampled_indices = sampled_indices.to(torch.long) + new_mask = [] + for l in range(layers): + tmp_mask_list = [] + for i in range(batch_size): + mask_tmp = attn_mask[i:i + 1, :, sampled_indices[l][i], :] + tmp_mask_list.append(mask_tmp[:, :, :, sampled_indices[l][i]]) + new_mask.append(torch.cat(tmp_mask_list, dim=0)) + + return sampled_indices.to(dtype), new_mask + + +class GatherTokens(torch.autograd.Function): + @staticmethod + def forward(ctx, + activations: torch.Tensor, + sorted_indices: torch.Tensor, + batch_first: bool): + global random_ltd_module + if random_ltd_module is None: + random_ltd_module = RandomLTDBuilder().load() + ctx.save_for_backward(activations, sorted_indices) + ctx.batch_first = batch_first + return activations, random_ltd_module.token_gather(activations, sorted_indices, batch_first) + + @staticmethod + def backward(ctx, a_gradients: torch.Tensor, g_gradients: torch.Tensor): + + g_gradients = g_gradients.contiguous() + global random_ltd_module + if random_ltd_module is None: + random_ltd_module = RandomLTDBuilder().load() + activations, sorted_indices = ctx.saved_tensors + batch_first = ctx.batch_first + + return random_ltd_module.token_scatter_(a_gradients, + g_gradients, + sorted_indices, + batch_first), None, None + + +class ScatterTokens(torch.autograd.Function): + @staticmethod + def forward(ctx, + all_activations: torch.Tensor, + layer_activations: torch.Tensor, + sorted_indices: torch.Tensor, + batch_first: bool): + global 
random_ltd_module + if random_ltd_module is None: + random_ltd_module = RandomLTDBuilder().load() + scatter_results = random_ltd_module.token_scatter_(all_activations.clone(), + layer_activations, + sorted_indices, + batch_first) + + ctx.save_for_backward(sorted_indices) + ctx.batch_first = batch_first + return scatter_results + + @staticmethod + def backward(ctx, out_gradients: torch.Tensor): + + out_gradients = out_gradients.contiguous() + global random_ltd_module + if random_ltd_module is None: + random_ltd_module = RandomLTDBuilder().load() + sorted_indices, = ctx.saved_tensors + batch_first = ctx.batch_first + + ret_val = random_ltd_module.token_gather(out_gradients, + sorted_indices, + batch_first) + return out_gradients, ret_val, None, None diff --git a/deepspeed/ops/sparse_attention/__init__.py b/deepspeed/ops/sparse_attention/__init__.py index b7e1db35845eb3ad2df367434dc7d40c2c6cadc4..870d73768d1d45535d02e477a5633cc8d16e8e00 100644 --- a/deepspeed/ops/sparse_attention/__init__.py +++ b/deepspeed/ops/sparse_attention/__init__.py @@ -1,4 +1,6 @@ -from .sparsity_config import SparsityConfig, DenseSparsityConfig, FixedSparsityConfig, VariableSparsityConfig, BigBirdSparsityConfig, BSLongformerSparsityConfig +'''Copyright The Microsoft DeepSpeed Team''' + +from .sparsity_config import SparsityConfig, DenseSparsityConfig, FixedSparsityConfig, VariableSparsityConfig, BigBirdSparsityConfig, BSLongformerSparsityConfig, LocalSlidingWindowSparsityConfig from .sparse_self_attention import SparseSelfAttention from .bert_sparse_self_attention import BertSparseSelfAttention from .sparse_attention_utils import SparseAttentionUtils diff --git a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py old mode 100644 new mode 100755 diff --git a/deepspeed/ops/sparse_attention/matmul.py b/deepspeed/ops/sparse_attention/matmul.py old mode 100644 new mode 100755 index 
ea83f093c74844a52111e59f5568397f47f7e1cd..17b0898fdd0abf558e9c5cf03fd390b6fa22db2f --- a/deepspeed/ops/sparse_attention/matmul.py +++ b/deepspeed/ops/sparse_attention/matmul.py @@ -1,13 +1,14 @@ +'''Copyright The Microsoft DeepSpeed Team''' + # DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a # https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py import importlib -import warnings import torch -import math import triton import triton.language as tl import triton._C.libtriton as libtriton +from deepspeed.accelerator import get_accelerator @triton.jit @@ -339,8 +340,8 @@ class _sparse_matmul(torch.autograd.Function): a_inner, b_inner = a.shape[a_dim], b.shape[b_dim] if a_inner != b_inner: raise ValueError( - f"Size of tensor A along the {_dim_to_name(a_dim)} dim ({a_inner}) must match size " - f"of tensor B along the {_dim_to_name(b_dim)} dim ({b_inner})") + f"Size of tensor A along the {a_dim} dim ({a_inner}) must match size " + f"of tensor B along the {b_dim} dim ({b_inner})") if a_inner % 16 != 0: raise ValueError('Reduction size for SDD must be a multiple of 16') @@ -950,7 +951,7 @@ class MatMul: raise ValueError( f"Inputs must be on the same device; got {a.device} for tensor A " f"and {b.device} for tensor B") - if not a.is_cuda: + if not get_accelerator().on_accelerator(a): raise ValueError("Only GPU devices are supported for now") # When autocast is enabled, torch.matmul autocasts to float16, so we do the same here diff --git a/deepspeed/ops/sparse_attention/softmax.py b/deepspeed/ops/sparse_attention/softmax.py old mode 100644 new mode 100755 index 11d4583fd619a60a1658c21f5b51ca798301a422..09560e103d16b9eaa456e47d3cb5c701bcae21e3 --- a/deepspeed/ops/sparse_attention/softmax.py +++ b/deepspeed/ops/sparse_attention/softmax.py @@ -1,14 +1,12 @@ +'''Copyright The Microsoft DeepSpeed Team''' + # DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a # 
https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py -import warnings -import importlib import torch -import math import triton import triton.language as tl -import triton._C.libtriton as libtriton def next_power_of_2(n): diff --git a/deepspeed/ops/sparse_attention/sparse_attention_utils.py b/deepspeed/ops/sparse_attention/sparse_attention_utils.py index 700363b204af9beca40d9bf284a8c35d3f022c7a..90edb10fd84a9835e5afe3b0c15cf3e6491a462f 100644 --- a/deepspeed/ops/sparse_attention/sparse_attention_utils.py +++ b/deepspeed/ops/sparse_attention/sparse_attention_utils.py @@ -2,7 +2,7 @@ Copyright 2020 The Microsoft DeepSpeed Team """ -from torch import nn +import torch from torch.nn import functional as F from deepspeed.ops.sparse_attention import BertSparseSelfAttention, SparsityConfig ''' @@ -102,13 +102,13 @@ class SparseAttentionUtils: if hasattr(model, 'bert'): model.config.max_position_embeddings = max_position - replace_self_attention_layer_with_sparse_self_attention_layer( + model.replace_self_attention_layer_with_sparse_self_attention_layer( model.config, model.bert.encoder.layer, sparsity_config) elif hasattr(model, 'roberta'): model.config.max_position_embeddings = max_position + 2 - replace_self_attention_layer_with_sparse_self_attention_layer( + model.replace_self_attention_layer_with_sparse_self_attention_layer( model.config, model.roberta.encoder.layer, sparsity_config) @@ -155,7 +155,7 @@ class SparseAttentionUtils: position_ids, inputs_embeds, pad_token_id, - model_mbeddings): + model_embeddings): """This function pads input tokens and attention mask on sequence length dimension to be multiple of block size. This is a requirement for Sparse Transformer in which the self attention layer works on sequences of length multiple of block size. It needs to be called in your model, such as BertModel, right before you calculate the embedding outputs. 
diff --git a/deepspeed/ops/sparse_attention/sparse_self_attention.py b/deepspeed/ops/sparse_attention/sparse_self_attention.py index 5ccfa13198fd8109c1fc181f26fe05c928082746..46dedd0779f9e4e67d4a69f392bf41c5f33caa2f 100644 --- a/deepspeed/ops/sparse_attention/sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/sparse_self_attention.py @@ -3,10 +3,8 @@ Copyright 2020 The Microsoft DeepSpeed Team """ import torch.nn as nn -from torch.nn.functional import * import torch from torch import distributed as dist -from collections import namedtuple from deepspeed.ops.sparse_attention import SparsityConfig diff --git a/deepspeed/ops/sparse_attention/sparsity_config.py b/deepspeed/ops/sparse_attention/sparsity_config.py index 3b9224f136cb4fcac9de2a5739aa69c55e7881d3..efdad2e4d7e534c36b29dc1ddbaea87a69b97354 100644 --- a/deepspeed/ops/sparse_attention/sparsity_config.py +++ b/deepspeed/ops/sparse_attention/sparsity_config.py @@ -429,7 +429,8 @@ class BigBirdSparsityConfig(SparsityConfig): different_layout_per_head=False, num_random_blocks=1, num_sliding_window_blocks=3, - num_global_blocks=1): + num_global_blocks=1, + attention='bidirectional'): """Initialize the BigBird Sparsity Pattern Config. For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial @@ -441,6 +442,7 @@ class BigBirdSparsityConfig(SparsityConfig): num_random_blocks: optional: an integer determining the number of random blocks in each block row. num_sliding_window_blocks: optional: an integer determining the number of blocks in sliding local attention window. num_global_blocks: optional: an integer determining how many consecutive blocks, starting from index 0, are considered as global attention. Global block tokens will be attended by all other block tokens and will attend to all other block tokens as well. + attention: optional: a string determining attention type. 
Attention can be `unidirectional`, such as autoregressive models, in which tokens attend only to tokens appear before them in the context. Considering that, the upper triangular of attention matrix is empty as above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to any other tokens before or after them. Then, the upper triangular part of the attention matrix is mirror of the lower triangular in the above figure. """ super().__init__(num_heads, block, different_layout_per_head) @@ -449,6 +451,11 @@ class BigBirdSparsityConfig(SparsityConfig): self.num_sliding_window_blocks = num_sliding_window_blocks self.num_global_blocks = num_global_blocks + if (attention != 'unidirectional' and attention != 'bidirectional'): + raise NotImplementedError( + 'only \"uni/bi-directional\" attentions are supported for now!') + self.attention = attention + def set_random_layout(self, h, layout): """Sets random attention layout used by the given head in the sparse attention. Note) By default, it assumes there will be a unique random block layout for all heads; unless `different_layout_per_head` parameter is set in which each head can have a different random layout. 
@@ -468,7 +475,11 @@ class BigBirdSparsityConfig(SparsityConfig): ) for row in range(0, num_blocks): - rnd_cols = random.sample(range(0, num_blocks), self.num_random_blocks) + sample_range = range( + 0, + num_blocks) if self.attention == 'bidirectional' else range(0, + row + 1) + rnd_cols = random.sample(sample_range, self.num_random_blocks) layout[h, row, rnd_cols] = 1 return layout @@ -519,6 +530,10 @@ class BigBirdSparsityConfig(SparsityConfig): #global columns layout[h, :, 0:self.num_global_blocks] = 1 + if self.attention == 'unidirectional': + # zero out anything attending to the future + layout = torch.tril(layout) + return layout def make_layout(self, seq_len): @@ -555,7 +570,8 @@ class BSLongformerSparsityConfig(SparsityConfig): different_layout_per_head=False, num_sliding_window_blocks=3, global_block_indices=[0], - global_block_end_indices=None): + global_block_end_indices=None, + attention='bidirectional'): """Initialize the edited `Longformer` Sparsity Pattern Config. For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial @@ -568,12 +584,14 @@ class BSLongformerSparsityConfig(SparsityConfig): num_sliding_window_blocks: optional: an integer determining the number of blocks in sliding local attention window. global_block_indices: optional: a list of integers determining which blocks are considered as global attention. Given indices, determine the blocks that all other token blocks attend to and they attend to all other token blocks. Default value is only index 0. Notice that if global_block_end_indices parameter is set, this parameter is used as starting index of each global window. global_block_end_indices: optional: a list of integers determining end indices of global window blocks. By default this is not used. 
But if it is set, it must have the same size of global_block_indices parameter, and combining this two parameters, for each index i, blocks from global_block_indices[i] to global_block_end_indices[i] (exclusive) are considered as global attention. + attention: optional: a string determining attention type. Attention can be `unidirectional`, such as autoregressive models, in which tokens attend only to tokens appear before them in the context. Considering that, the upper triangular of attention matrix is empty as above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to any other tokens before or after them. Then, the upper triangular part of the attention matrix is mirror of the lower triangular in the above figure. """ super().__init__(num_heads, block, different_layout_per_head) self.num_sliding_window_blocks = num_sliding_window_blocks self.global_block_indices = global_block_indices + self.attention = attention if (global_block_end_indices is not None): if (len(global_block_indices) != len(global_block_end_indices)): @@ -642,6 +660,8 @@ class BSLongformerSparsityConfig(SparsityConfig): #global columns layout[h, :, start_idx:end_idx] = 1 + if self.attention == 'unidirectional': + layout = torch.tril(layout) return layout def make_layout(self, seq_len): @@ -661,3 +681,63 @@ class BSLongformerSparsityConfig(SparsityConfig): layout = self.check_and_propagate_first_head_layout(layout) return layout + + +class LocalSlidingWindowSparsityConfig(SparsityConfig): + """Configuration class to store `Local Sliding Window` sparsity configuration - a purely-local sliding window attention. + This class extends parent class of `SparsityConfig` and customizes it for `Local` sparsity. + """ + def __init__(self, + num_heads, + block=16, + num_sliding_window_blocks=3, + attention='unidirectional'): + """Initialize the Local Sliding Window Sparsity Pattern Config. 
+ For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial + Arguments: + num_heads: required: an integer determining number of attention heads of the layer. + block: optional: an integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such blocks, `Block X Block`. + num_sliding_window_blocks: optional: an integer determining the number of blocks in sliding local attention window. + attention: optional: a string determining attention type. Attention can be `unidirectional`, such as autoregressive models, in which tokens attend only to tokens appear before them in the context. Considering that, the upper triangular of attention matrix is empty as above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to any other tokens before or after them. Then, the upper triangular part of the attention matrix is mirror of the lower triangular in the above figure. + """ + + super().__init__(num_heads, block) + self.num_sliding_window_blocks = num_sliding_window_blocks + self.attention = attention + + def set_sliding_window_layout(self, h, layout): + """Sets sliding local attention layout used by the given head in the sparse attention. + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local sliding window layout is set + """ + + num_blocks = layout.shape[1] + if (num_blocks < self.num_sliding_window_blocks): + raise ValueError( + f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' 
+ ) + + w = self.num_sliding_window_blocks // 2 + for row in range(0, num_blocks): + start = max(0, row - w) + end = min(row + w + 1, + num_blocks) if self.attention == "bidirectional" else row + 1 + layout[h, row, start:end] = 1 + return layout + + def make_layout(self, seq_len): + """Generates `Local Sliding Window` sparsity layout used by each head in the sparse attention. + Arguments: + seq_len: required: an integer determining number of attention heads of the layer. + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing `BigBird` sparsity layout of all head + """ + + layout = self.setup_layout(seq_len) + for h in range(0, self.num_layout_heads): + layout = self.set_sliding_window_layout(h, layout) + layout = self.check_and_propagate_first_head_layout(layout) + return layout diff --git a/deepspeed/ops/sparse_attention/trsrc/__init__.py b/deepspeed/ops/sparse_attention/trsrc/__init__.py index 765d34f574a2d8960332e776bb6a860be5297b54..b21068e98f8e06b5188de0dc3ed8985003abb39f 100644 --- a/deepspeed/ops/sparse_attention/trsrc/__init__.py +++ b/deepspeed/ops/sparse_attention/trsrc/__init__.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import sys import os diff --git a/deepspeed/ops/transformer/__init__.py b/deepspeed/ops/transformer/__init__.py old mode 100644 new mode 100755 index 28c8de68dd8b7999e1b141215f9e0de66a3bcf06..77d666c869a977ee73fbdc33185dccb6565299d4 --- a/deepspeed/ops/transformer/__init__.py +++ b/deepspeed/ops/transformer/__init__.py @@ -1,3 +1,6 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -from .inference.transformer_inference import DeepSpeedTransformerInference, DeepSpeedInferenceConfig +from .inference.config import DeepSpeedInferenceConfig +from ...model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference from .inference.moe_inference import DeepSpeedMoEInferenceConfig, 
DeepSpeedMoEInference diff --git a/deepspeed/ops/transformer/inference/__init__.py b/deepspeed/ops/transformer/inference/__init__.py index 66f8124015cbba8afec7f69916659b32b20985e9..6e9ca0051e789d3d48573a4d246573caad1ea2f1 100644 --- a/deepspeed/ops/transformer/inference/__init__.py +++ b/deepspeed/ops/transformer/inference/__init__.py @@ -1,2 +1,5 @@ -from .transformer_inference import DeepSpeedTransformerInference, DeepSpeedInferenceConfig +'''Copyright The Microsoft DeepSpeed Team''' + +from .config import DeepSpeedInferenceConfig +from ....model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference from .moe_inference import DeepSpeedMoEInferenceConfig, DeepSpeedMoEInference diff --git a/deepspeed/ops/transformer/inference/bias_add.py b/deepspeed/ops/transformer/inference/bias_add.py new file mode 100644 index 0000000000000000000000000000000000000000..b7d2944fb5f8470198c0ab0cb7d84edf1f342023 --- /dev/null +++ b/deepspeed/ops/transformer/inference/bias_add.py @@ -0,0 +1,28 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +from typing import Optional +import torch +from deepspeed.ops.op_builder import SpatialInferenceBuilder + +spatial_cuda_module = None + + +def nhwc_bias_add(activation: torch.Tensor, + bias: torch.Tensor, + other: Optional[torch.Tensor] = None, + other_bias: Optional[torch.Tensor] = None) -> torch.Tensor: + global spatial_cuda_module + if spatial_cuda_module is None: + spatial_cuda_module = SpatialInferenceBuilder().load() + + if other is None: + return spatial_cuda_module.nhwc_bias_add(activation, bias) + elif other_bias is None: + return spatial_cuda_module.nhwc_bias_add_add(activation, bias, other) + else: + return spatial_cuda_module.nhwc_bias_add_bias_add(activation, + bias, + other, + other_bias) diff --git a/deepspeed/ops/transformer/inference/config.py b/deepspeed/ops/transformer/inference/config.py new file mode 100644 index 
0000000000000000000000000000000000000000..f0c87ba5619bf65f188a91f75f1676d7b49e2ce7 --- /dev/null +++ b/deepspeed/ops/transformer/inference/config.py @@ -0,0 +1,115 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +import json +from deepspeed.utils.types import ActivationFuncType + + +class TransformerConfig(): + def __init__(self, hidden_size, intermediate_size, heads, num_hidden_layers): + self.layer_id = -1 + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.heads = heads + self.num_hidden_layers = num_hidden_layers + + +class DeepSpeedInferenceConfig(TransformerConfig): + """Initialize the DeepSpeed Transformer Config. + Arguments: + hidden_size: The hidden size of the transformer layer + intermediate_size: The intermediate size of the feed-forward part of transformer layer + heads: The number of heads in the self-attention of the transformer layer + num_hidden_layers: The number of transformer layers + layer_norm_eps: The epsilon value for the layer norm + local_rank: Optional: The rank of GPU running the transformer kernel, it is not required + to use if the model already set the current device, otherwise need to set it + so that the transformer kernel can work on the right device + mp_size (optional): This argument is mainly used to create the parameters on the kernel side + using model-parallel architecture. If the client model already takes care of this, there is no + need to pass this argument. + fp16: Enable half-precision computation + pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture + stochastic_mode: Enable for high performance, please note that this flag has some level of + non-determinism and can produce different results on different runs. However, we have seen + that by enabling it, the pretraining tasks such as BERT are not affected and can obtain + a high accuracy level. 
On the other hand, for the downstream tasks, such as fine-tuning, we recommend + to turn it off in order to be able to reproduce the same result through the regular kernel execution. + + scale_attention: If true, both q and k are scaled by 1/sqrt(attention_heads) before attention computation. + return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor + bigscience_bloom: This flag is added temporarily for supporting the BLOOM-176B model architecture. + """ + def __init__(self, + hidden_size=-1, + intermediate_size=-1, + heads=-1, + num_hidden_layers=-1, + layer_norm_eps=1e-12, + local_rank=-1, + mp_size=1, + fp16=False, + q_int8=False, + pre_layer_norm=True, + stochastic_mode=False, + scale_attention=True, + triangular_masking=True, + local_attention=False, + window_size=256, + rotary_dim=-1, + rotate_half=False, + rotate_every_two=True, + return_tuple=True, + mlp_after_attn=True, + mlp_act_func_type=ActivationFuncType.GELU, + training_mp_size=1, + bigscience_bloom=False, + max_out_tokens=1024, + enable_qkv_quantization=False, + use_mup=False, + scale_attn_by_inverse_layer_idx=False, + return_single_tuple=False): + super(DeepSpeedInferenceConfig, + self).__init__( + hidden_size, + (intermediate_size if intermediate_size > 0 else 4 * hidden_size), + heads, + num_hidden_layers) + self.fp16 = fp16 + self.pre_layer_norm = pre_layer_norm + self.local_rank = local_rank + self.stochastic_mode = stochastic_mode + self.epsilon = layer_norm_eps + self.mp_size = mp_size + self.q_int8 = q_int8 + self.scale_attention = scale_attention + self.triangular_masking = triangular_masking + self.local_attention = local_attention + self.window_size = window_size + self.rotary_dim = rotary_dim + self.rotate_half = rotate_half + self.rotate_every_two = rotate_every_two + self.return_tuple = return_tuple + self.mlp_after_attn = mlp_after_attn + self.mlp_act_func_type = mlp_act_func_type + self.specialized_mode = False + self.training_mp_size = 
training_mp_size + self.bigscience_bloom = bigscience_bloom + self.max_out_tokens = max_out_tokens + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + self.enable_qkv_quantization = enable_qkv_quantization + self.use_mup = use_mup + self.return_single_tuple = return_single_tuple + + @classmethod + def from_dict(cls, json_object): + config = DeepSpeedInferenceConfig() + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) diff --git a/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py b/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..246eab8676fe4e1a3bab1c05adefec06316e632e --- /dev/null +++ b/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py @@ -0,0 +1,8 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + + +class Diffusers2DTransformerConfig(): + def __init__(self, int8_quantization=False): + self.int8_quantization = int8_quantization diff --git a/deepspeed/ops/transformer/inference/diffusers_attention.py b/deepspeed/ops/transformer/inference/diffusers_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..9d829ce3e0729d70e7a1a87a0012a844700f2992 --- /dev/null +++ b/deepspeed/ops/transformer/inference/diffusers_attention.py @@ -0,0 +1,248 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +import math +import torch +from torch.autograd import Function +import torch.nn as nn +from packaging import version as pkg_version +from deepspeed.utils.logging import log_dist +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder + +# Cuda modules will be imported if needed +inference_cuda_module = None +minus_inf = -10000.0 +triton_flash_attn = 
None + + +def load_triton_flash_attn(): + global triton_flash_attn + try: + import triton + except ImportError: + raise ImportError("Please install triton 2.0+ or `pip install deepspeed[sd]`") + + if pkg_version.parse(triton.__version__) < pkg_version.parse("2.0"): + raise ImportError("Please install triton 2.0+ or `pip install deepspeed[sd]`") + + from .triton_ops import triton_flash_attn + + +class DeepSpeedDiffusersAttentionFunction(Function): + @staticmethod + def forward(ctx, + input, + context, + input_mask, + config, + attn_qkvw, + attn_qw, + attn_kw, + attn_vw, + attn_qkvb, + num_attention_heads_per_partition, + norm_factor, + hidden_size_per_partition, + attn_ow, + attn_ob, + do_out_bias, + score_context_func, + linear_func, + triton_flash_attn_kernel): + def _transpose_for_context(x): + x = x.permute(0, 2, 1, 3) + new_x_layer_shape = x.size()[:-2] + \ + (hidden_size_per_partition,) + return x.reshape(*new_x_layer_shape) + + def _transpose_for_scores(x): + attention_head_size = x.shape[-1] // num_attention_heads_per_partition + new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, + attention_head_size) + x = x.reshape(*new_x_shape) + x = x.permute(0, 2, 1, 3) + return x.contiguous() + + def selfAttention_fp(input, context, input_mask): + if config.fp16 and input.dtype == torch.float32: + input = input.half() + head_size = input.shape[-1] // config.heads + do_flash_attn = (head_size <= 128) + scale = (1 / norm_factor) * (1 / norm_factor) + if do_flash_attn and context == None: + qkv_out = linear_func(input, + attn_qkvw, + attn_qkvb if attn_qkvb is not None else attn_qkvw, + attn_qkvb is not None, + do_flash_attn, + config.heads) + + context_layer = triton_flash_attn_kernel(qkv_out[0], + qkv_out[1], + qkv_out[2], + scale, + input.shape[-2] % 128 == 0) + context_layer = _transpose_for_context(context_layer[:,:,:,:head_size]) + + else: + do_flash_attn = False + if context is not None: + query = torch.matmul(input, attn_qw) + key = 
torch.matmul(context, attn_kw) + value = torch.matmul(context, attn_vw) + else: + qkv = torch.matmul(input, attn_qkvw) + query, key, value = qkv.chunk(3, dim=-1) + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + query, key, value = inference_cuda_module.pad_transform_fp16(query, key, value, config.heads, do_flash_attn) + attention_scores = (torch.matmul(query, + key.transpose(-1, + -2)) * + scale).softmax(dim=-1) + context_layer = _transpose_for_context( + torch.matmul(attention_scores, + value)) + + output = linear_func(context_layer, + attn_ow, + attn_ob, + do_out_bias, + False, + config.heads) + return output + + output = selfAttention_fp(input, context, input_mask) + + return output + + @staticmethod + def backward(ctx, grad_output, grad_output1, grad_output2, grad_output3): + raise RuntimeError('You are running with DeepSpeed Inference mode. \ + Please switch to Training mode for running backward!') + + +class DeepSpeedDiffusersAttention(nn.Module): + """Initialize the DeepSpeed Transformer Layer. + Arguments: + layer_id: The layer index starting from 0, e.g. 
if model has 24 transformer layers, + layer_id will be 0,1,2...23 when each layer object is instantiated + config: An object of DeepSpeedInferenceConfig + """ + layer_id = 0 + + def __init__( + self, + config, + ): + super(DeepSpeedDiffusersAttention, self).__init__() + + self.config = config + self.config.layer_id = DeepSpeedDiffusersAttention.layer_id + DeepSpeedDiffusersAttention.layer_id += 1 + device = get_accelerator().current_device_name( + ) if config.bigscience_bloom else 'cpu' + qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3 + + data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float + data_type_fp = torch.half if config.fp16 else torch.float + global inference_cuda_module + if inference_cuda_module is None: + builder = InferenceBuilder() + inference_cuda_module = builder.load() + + if DeepSpeedDiffusersAttention.layer_id == 1: + log_dist(f"DeepSpeed-Attention config: {self.config.__dict__}", [0]) + + self.attn_qkvw = nn.Parameter(torch.empty(self.config.hidden_size, + qkv_size_per_partition, + dtype=data_type, + device=device), + requires_grad=False) + self.attn_kw = nn.Parameter(torch.empty(self.config.hidden_size, + self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + self.attn_vw = nn.Parameter(torch.empty(self.config.hidden_size, + self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + self.attn_qw = nn.Parameter(torch.empty(self.config.hidden_size, + self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, + dtype=data_type_fp, + device=device), + requires_grad=False) + out_size_per_partition = self.config.hidden_size // self.config.mp_size + self.attn_ow = nn.Parameter(torch.empty(out_size_per_partition, + self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + + self.attn_ob = 
nn.Parameter(torch.empty(self.config.hidden_size, + dtype=data_type_fp, + device=device), + requires_grad=False) + self.do_out_bias = True + + if triton_flash_attn is None: + load_triton_flash_attn() + self.triton_flash_attn_kernel = triton_flash_attn() + self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size + self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size + self.hidden_size_per_attention_head = self.config.hidden_size // self.config.heads + + self.norm_factor = math.sqrt( + math.sqrt(self.config.hidden_size // self.config.heads)) + + if self.config.scale_attn_by_inverse_layer_idx is True: + self.norm_factor *= math.sqrt(self.config.layer_id + 1) + # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/gpt2/modeling_gpt2.py#L191 + + self.score_context_func = inference_cuda_module.softmax_context_fp32 if (not config.fp16) else \ + inference_cuda_module.softmax_context_fp16 + self.linear_func = inference_cuda_module.linear_layer_fp16 if config.fp16 else \ + inference_cuda_module.linear_layer_fp32 + self.allocate_workspace = inference_cuda_module.allocate_workspace_fp32 if not (config.fp16) else \ + inference_cuda_module.allocate_workspace_fp16 + + def forward(self, input, context=None, input_mask=None): + if self.config.layer_id == 0: + self.allocate_workspace(self.config.hidden_size, + self.config.heads, + input.size()[1], + input.size()[0], + DeepSpeedDiffusersAttention.layer_id, + self.config.mp_size, + False, + 0, + self.config.max_out_tokens) + output = DeepSpeedDiffusersAttentionFunction.apply( + input, + context, + input_mask, + self.config, + self.attn_qkvw, + self.attn_qw, + self.attn_kw, + self.attn_vw, + self.attn_qkvb, + self.num_attention_heads_per_partition, + self.norm_factor, + self.hidden_size_per_partition, + self.attn_ow, + self.attn_ob, + self.do_out_bias, + self.score_context_func, + self.linear_func, + self.triton_flash_attn_kernel) + + return output diff 
--git a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py new file mode 100644 index 0000000000000000000000000000000000000000..e453c343e9f04119a7a508f8545249c3285f1aec --- /dev/null +++ b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py @@ -0,0 +1,126 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +import torch +import torch.nn as nn + +from deepspeed import module_inject +from .diffusers_attention import DeepSpeedDiffusersAttention +from .bias_add import nhwc_bias_add +from .diffusers_2d_transformer import Diffusers2DTransformerConfig +from deepspeed.ops.op_builder import InferenceBuilder, SpatialInferenceBuilder + +# Ops will be loaded on demand +transformer_cuda_module = None +spatial_cuda_module = None + + +def load_transformer_module(): + global transformer_cuda_module + if transformer_cuda_module is None: + transformer_cuda_module = InferenceBuilder().load() + return transformer_cuda_module + + +def load_spatial_module(): + global spatial_cuda_module + if spatial_cuda_module is None: + spatial_cuda_module = SpatialInferenceBuilder().load() + return spatial_cuda_module + + +class DeepSpeedDiffusersTransformerBlock(nn.Module): + def __init__(self, + equivalent_module: nn.Module, + config: Diffusers2DTransformerConfig): + super(DeepSpeedDiffusersTransformerBlock, self).__init__() + self.quantizer = module_inject.GroupQuantizer(q_int8=config.int8_quantization) + # Ensure ops are built by the time we start running + self.config = config + + self.ff1_w = self.quantizer.quantize( + nn.Parameter(equivalent_module.ff.net[0].proj.weight.data, + requires_grad=False)) + self.ff1_b = nn.Parameter(equivalent_module.ff.net[0].proj.bias.data, + requires_grad=False) + self.ff2_w = self.quantizer.quantize( + nn.Parameter(equivalent_module.ff.net[2].weight.data, + requires_grad=False)) + self.ff2_b = nn.Parameter(equivalent_module.ff.net[2].bias.data, + 
requires_grad=False) + + self.norm1_g = nn.Parameter(equivalent_module.norm1.weight.data, + requires_grad=False) + self.norm1_b = nn.Parameter(equivalent_module.norm1.bias.data, + requires_grad=False) + self.norm1_eps = equivalent_module.norm1.eps + + self.norm2_g = nn.Parameter(equivalent_module.norm2.weight.data, + requires_grad=False) + self.norm2_b = nn.Parameter(equivalent_module.norm2.bias.data, + requires_grad=False) + self.norm2_eps = equivalent_module.norm2.eps + + self.norm3_g = nn.Parameter(equivalent_module.norm3.weight.data, + requires_grad=False) + self.norm3_b = nn.Parameter(equivalent_module.norm3.bias.data, + requires_grad=False) + self.norm3_eps = equivalent_module.norm3.eps + + self.attn_1 = equivalent_module.attn1 + self.attn_2 = equivalent_module.attn2 + + # Pull the bias in if we can + if isinstance(self.attn_1, DeepSpeedDiffusersAttention): + self.attn_1.do_out_bias = False + self.attn_1_bias = self.attn_1.attn_ob + else: + self.attn_1_bias = nn.Parameter(torch.zeros_like(self.norm2_g), + requires_grad=False) + + # Pull the bias in if we can + if isinstance(self.attn_2, DeepSpeedDiffusersAttention): + self.attn_2.do_out_bias = False + self.attn_2_bias = self.attn_2.attn_ob + else: + self.attn_2_bias = nn.Paramaeter(torch.zeros_like(self.norm3_g), + requires_grad=False) + + self.transformer_cuda_module = load_transformer_module() + load_spatial_module() + + def forward(self, hidden_states, context=None, timestep=None, **kwargs): + # In v0.12.0 of diffuser, several new kwargs were added. 
Capturing + # those with kwargs to maintain backward compatibility + + # In v0.11.0 of diffusers, the kwarg was changed from 'context' to 'encoder_hidden_states' + # This is so we can support older and newer versions of diffusers + if "encoder_hidden_states" in kwargs and kwargs["encoder_hidden_states"] != None: + context = kwargs["encoder_hidden_states"] + + out_norm_1 = self.transformer_cuda_module.layer_norm(hidden_states, + self.norm1_g, + self.norm1_b, + self.norm1_eps) + out_attn_1 = self.attn_1(out_norm_1) + + out_norm_2, out_attn_1 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res(out_attn_1, + self.attn_1_bias, + hidden_states, + self.norm2_g, + self.norm2_b, + self.norm2_eps) + out_attn_2 = self.attn_2(out_norm_2, context=context) + out_norm_3, out_attn_2 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res(out_attn_2, + self.attn_2_bias, + out_attn_1, + self.norm3_g, + self.norm3_b, + self.norm3_eps) + + out_ff1 = nn.functional.linear(out_norm_3, self.ff1_w) + out_geglu = self.transformer_cuda_module.bias_geglu(out_ff1, self.ff1_b) + + out_ff2 = nn.functional.linear(out_geglu, self.ff2_w) + return nhwc_bias_add(out_ff2, self.ff2_b, other=out_attn_2) diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..f4ec14bf49906916e606e73f2c8d00a39971e730 --- /dev/null +++ b/deepspeed/ops/transformer/inference/ds_attention.py @@ -0,0 +1,277 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +import math +import torch +import torch.nn as nn +from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator +from .op_binding import LinearOp, VectorMatMulOp, SoftmaxContextOp, QKVGemmOp, SoftmaxOp + +minus_inf = -10000.0 + + +class DeepSpeedSelfAttention(nn.Module): + num_layers = 0 + + def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count=1): + 
super(DeepSpeedSelfAttention, self).__init__() + self.config = config + data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float + data_type_fp = torch.half if config.fp16 else torch.float + self.config.layer_id = DeepSpeedSelfAttention.num_layers + DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1 + device = get_accelerator().current_device_name( + ) #if config.bigscience_bloom else 'cpu' + qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3 + self.attn_qkvw = nn.Parameter(torch.empty(self.config.hidden_size, + qkv_size_per_partition, + dtype=data_type, + device=device), + requires_grad=False) + self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, + dtype=data_type_fp, + device=device), + requires_grad=False) + out_size_per_partition = self.config.hidden_size // self.config.mp_size + self.attn_ow = nn.Parameter(torch.empty(out_size_per_partition, + self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + + self.attn_ob = nn.Parameter(torch.empty(self.config.hidden_size, + dtype=data_type_fp, + device=device), + requires_grad=False) + + self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size + self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size + self.hidden_size_per_attention_head = self.config.hidden_size // self.config.heads + + self.mp_group = mp_group + + # used for quantization + self.q_scales = q_scales + self.q_groups = q_groups + self.merge_count = int(math.log2(merge_count)) + + self.norm_factor = math.sqrt(self.config.hidden_size // self.config.heads) + if not config.use_mup: + self.norm_factor = math.sqrt(self.norm_factor) + + if self.config.scale_attn_by_inverse_layer_idx is True: + self.norm_factor *= math.sqrt(self.config.layer_id + 1) + # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/gpt2/modeling_gpt2.py#L191 + + self.qkv_func = 
QKVGemmOp(config) + self.score_context_func = SoftmaxContextOp(config) + self.linear_func = LinearOp(config) + self.vector_matmul_func = VectorMatMulOp(config) + + def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + if isinstance(qkv_out, list): + qkv_out = qkv_out[0] + + no_masking = input_mask is None + + if no_masking: + input_mask = torch.empty(1) + + attn_key_value = self.score_context_func( + query_key_value=qkv_out, + attn_mask=((1 - input_mask).to(qkv_out.dtype) * + minus_inf) if input_mask.dtype == torch.int64 else input_mask, + heads=self.num_attention_heads_per_partition, + norm_factor=(1 / self.norm_factor if self.config.scale_attention else 1.0), + no_masking=no_masking, + layer_id=self.config.layer_id, + num_layers=DeepSpeedSelfAttention.num_layers, + alibi=alibi) + + context_layer, key_layer, value_layer = attn_key_value + return context_layer, key_layer, value_layer + + def forward(self, + input, + input_mask, + head_mask=None, + layer_past=None, + get_present=False, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + norm_w=None, + norm_b=None, + alibi=None): + + if not self.config.pre_layer_norm: + qkv_out = self.linear_func(input=input, + weight=self.attn_qkvw, + bias=self.attn_qkvb, + add_bias=self.attn_qkvb is not None, + do_flash_attn=False, + num_heads=self.num_attention_heads_per_partition, + num_layers=DeepSpeedSelfAttention.num_layers) + else: + qkv_out = self.qkv_func( + input=input, + weight=self.attn_qkvw, + bias=(self.attn_qkvb if self.attn_qkvb is not None else norm_b), + gamma=norm_w, + beta=norm_b, + add_bias=(self.attn_qkvb is not None), + num_layers=DeepSpeedSelfAttention.num_layers, + num_heads=self.num_attention_heads_per_partition) + + context_layer, key_layer, value_layer = self.compute_attention( + qkv_out=qkv_out, + input_mask=input_mask, + layer_past=layer_past, + alibi=alibi) + + output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) + + inp_norm 
= qkv_out[-1] + + if self.config.mlp_after_attn and self.mp_group is not None and dist.get_world_size( + group=self.mp_group) > 1: + dist.all_reduce(output, group=self.mp_group) + + return (output, key_layer, value_layer, context_layer, inp_norm) + + +class BloomSelfAttention(DeepSpeedSelfAttention): + def __init__(self, *args, **kwargs): + super(BloomSelfAttention, self).__init__(*args, **kwargs) + self.softmax_func = SoftmaxOp(self.config) + + ########### This part is taken/modified form the HF modeling_bloom.py ################ + # Reference: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py + + def _transpose_for_context(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_layer_shape = x.size()[:-2] + \ + (self.hidden_size_per_partition,) + return x.view(*new_x_layer_shape).contiguous() + + def _split_tensor_along_last_dim(self, + tensor, + num_partitions, + contiguous_split_chunks=True): + """Split a tensor along its last dimension. + + Args: + tensor: ([`torch.tensor`], *required*): + input tensor to split + num_partitions ([`int`], *required*): + number of partitions to split the tensor + contiguous_split_chunks ([`bool`], *optional*, default=`False`):: + If True, make each chunk contiguous in memory. + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + numerator, denominator = tensor.size()[last_dim], num_partitions + if not (numerator % denominator == 0): + raise ValueError(f"{numerator} is not divisible by {denominator}") + last_dim_size = numerator // denominator + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. 
+ if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + if isinstance(qkv_out, list): + qkv_out = qkv_out[0] + + no_masking = input_mask is None + + if no_masking: + input_mask = torch.empty(1) + + mixed_x_layer = qkv_out + alibi = alibi.to(get_accelerator().current_device_name()) + head_dim = self.hidden_size_per_partition // self.num_attention_heads_per_partition + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * head_dim) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + query_layer, key_layer, value_layer = self._split_tensor_along_last_dim(mixed_x_layer, 3) + + # [batch_size, head_dim, q_length, k_length] + output_size = (query_layer.size(0), + query_layer.size(2), + query_layer.size(1), + key_layer.size(1)) + # [batch_size, q_length, num_heads, head_dim] -> [q_length, batch_size * num_heads, head_dim] + query_layer = query_layer.transpose(1, + 2).reshape(output_size[0] * output_size[1], + output_size[2], + -1) + # [batch_size, k_length, num_heads, head_dim] -> [k_length, batch_size * num_heads, head_dim] + key_layer = key_layer.transpose(1, + 2).reshape(output_size[0] * output_size[1], + output_size[3], + -1).transpose(-1, + -2) + value_layer = value_layer.transpose(1, + 2).reshape(output_size[0] * output_size[1], + output_size[3], + -1) + if layer_past is not None: + past_key, past_value = layer_past + # concatenate along seq_length dimension -> [batch_size, qk_length, num_heads, head_dim] + key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=-1) + value_layer = torch.cat((past_value.type_as(value_layer), + value_layer), + dim=-2) + + presents = (key_layer, value_layer) + # Raw attention scores. 
[batch_size * num_heads, q_length, k_length] + matmul_result = torch.matmul(query_layer, key_layer) + # change view to [batch_size, num_heads, q_length, k_length] + attention_scores = matmul_result.view(output_size[0], + output_size[1], + output_size[2], + -1) + + offset = dist.get_rank( + ) * self.num_attention_heads_per_partition if dist.is_initialized() else 0 + attention_probs = self.softmax_func( + attn_scores=attention_scores, + attn_mask=((1 - input_mask).half() * minus_inf), + alibi=alibi, + triangular=(self.config.triangular_masking + and (attention_scores.shape[-2] > 1)), + recompute=False, + local_attention=False, + window_size=1, + async_op=False, + layer_scale=1 / (self.norm_factor * self.norm_factor), + head_offset=offset) + + # change view [batch_size x num_heads, q_length, k_length] + attention_probs_reshaped = attention_probs.view(*matmul_result.shape) + + # matmul: [batch_size * num_heads, q_length, head_dim] + context_layer = torch.bmm(attention_probs_reshaped, value_layer) + + # change view [batch_size, num_heads, q_length, head_dim] + context_layer = context_layer.view( + context_layer.size(0) // self.num_attention_heads_per_partition, + self.num_attention_heads_per_partition, + context_layer.size(1), + context_layer.shape[-1]) + + context_layer = self._transpose_for_context(context_layer) + key_layer = presents[0] + value_layer = presents[1] + + return context_layer, key_layer, value_layer + + ###################### End of HF modeling_bloom addition ######################## diff --git a/deepspeed/ops/transformer/inference/ds_mlp.py b/deepspeed/ops/transformer/inference/ds_mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..02d492d8e32b38ed82269fba7ce264d7328984be --- /dev/null +++ b/deepspeed/ops/transformer/inference/ds_mlp.py @@ -0,0 +1,94 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +import math +import torch +import torch.nn as nn +from deepspeed import comm as dist +from deepspeed.accelerator import 
get_accelerator +from .op_binding import MLPGemmOp, VectorMatMulOp, GELUGemmOp, ResidualAddOp + + +class DeepSpeedMLP(nn.Module): + def __init__(self, + config, + mp_group=None, + q_scales=None, + q_groups=1, + merge_count=1, + mlp_extra_grouping=False): + super(DeepSpeedMLP, self).__init__() + + self.config = config + data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float + data_type_fp = torch.half if config.fp16 else torch.float + device = get_accelerator().current_device_name() + self.attn_nw = nn.Parameter(torch.empty(self.config.hidden_size, + dtype=data_type_fp, + device=device), + requires_grad=False) + self.attn_nb = nn.Parameter(torch.empty(self.config.hidden_size, + dtype=data_type_fp, + device=device), + requires_grad=False) + intm_size_per_partition = self.config.intermediate_size // self.config.mp_size + self.inter_w = nn.Parameter(torch.empty(self.config.hidden_size, + intm_size_per_partition, + dtype=data_type, + device=device), + requires_grad=False) + self.inter_b = nn.Parameter(torch.empty(intm_size_per_partition, + dtype=data_type_fp, + device=device), + requires_grad=False) + self.output_w = nn.Parameter(torch.empty(intm_size_per_partition, + self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + self.output_b = nn.Parameter(torch.empty(self.config.hidden_size, + dtype=data_type_fp, + device=device), + requires_grad=False) + + # used for quantization + self.q_scales = q_scales + self.q_groups = q_groups * 2 if mlp_extra_grouping else q_groups + self.merge_count = int(math.log2(merge_count)) + self.mp_group = mp_group + + self.mlp_gemm_func = MLPGemmOp(config) + self.vector_matmul_func = VectorMatMulOp(config) + self.fused_gemm_gelu = GELUGemmOp(config) + self.residual_add_func = ResidualAddOp(config) + + def forward(self, input, residual, residual_norm, bias): + residual_add = None + if self.attn_nw is None: + output = self.fused_gemm_gelu(input=residual_norm, + 
weight=self.inter_w, + bias=self.inter_b, + weight_out=self.output_w) + else: + output, residual_add = self.mlp_gemm_func(input=input, + residual=residual, + input_bias=bias, + weight_interm=self.inter_w, + weight_out=self.output_w, + bias=self.inter_b, + gamma=self.attn_nw, + beta=self.attn_nb) + residual = self.residual_add_func( + hidden_state=output, + residual=residual, + attention_output=input, + attention_bias=bias if bias is not None else self.output_b, + final_bias=self.output_b, + add_bias=bias is not None, + residual_add=residual_add) + + if self.mp_group is not None and dist.get_world_size(group=self.mp_group) > 1: + dist.all_reduce(residual, group=self.mp_group) + + return residual diff --git a/deepspeed/ops/transformer/inference/moe_inference.py b/deepspeed/ops/transformer/inference/moe_inference.py index 855211baf57b15911f97ff5d28fb3c4efa6b3207..d5e45c7eb00949474a1faf54991999e4af104d16 100644 --- a/deepspeed/ops/transformer/inference/moe_inference.py +++ b/deepspeed/ops/transformer/inference/moe_inference.py @@ -3,22 +3,19 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' import json import math -import importlib import torch -from torch import nn from torch.autograd import Function -import time -from ... 
import op_builder #from ...inference.engine import inference_cuda_module, specialized_mode # Cuda modules will be imported if needed inference_cuda_module = None specialized_mode = None import torch.nn as nn -from .transformer_inference import DeepSpeedSelfAttention, DeepSpeedInferenceConfig +from .ds_attention import DeepSpeedSelfAttention +from .config import DeepSpeedInferenceConfig from ....moe.sharded_moe import TopKGate -import torch.distributed as dist - -import torch.nn.functional as F +from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): @@ -72,7 +69,8 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): noisy_gate_policy=None, drop_tokens=True, use_rts=False, - mlp_type='standard'): + mlp_type='standard', + scale_attn_by_inverse_layer_idx=False): super(DeepSpeedMoEInferenceConfig, self).__init__( hidden_size, @@ -101,6 +99,7 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): self.use_rts = use_rts self.global_experts = global_experts self.mlp_type = mlp_type + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx @classmethod def from_dict(cls, json_object): @@ -233,8 +232,7 @@ class DeepSpeedMoEInference(nn.Module): quantize_scales=None, quantize_groups=1, merge_count=1, - mlp_extra_grouping=False, - qkv_merging=False): + mlp_extra_grouping=False): super(DeepSpeedMoEInference, self).__init__() self.config = config @@ -243,15 +241,13 @@ class DeepSpeedMoEInference(nn.Module): global specialized_mode if inference_cuda_module is None: specialized_mode = False - if hasattr(op_builder, 'InferenceSpecializedBuilder'): - builder = op_builder.InferenceSpecializedBuilder() - if builder.is_compatible(): - inference_cuda_module = builder.load() - specialized_mode = True - else: - inference_cuda_module = op_builder.InferenceBuilder().load() + # InferenceSpecializedBuilder is not 
among DeepSpeed provided builder yet, so we infer by builder name string + builder = get_accelerator().create_op_builder("InferenceSpecializedBuilder") + if builder != None and builder.is_compatible(): + inference_cuda_module = builder.load() + specialized_mode = True else: - inference_cuda_module = op_builder.InferenceBuilder().load() + inference_cuda_module = InferenceBuilder().load() self.config.specialized_mode = specialized_mode DeepSpeedMoEInference.layer_id += 1 @@ -259,8 +255,7 @@ class DeepSpeedMoEInference(nn.Module): mp_group, quantize_scales, quantize_groups, - merge_count, - qkv_merging) + merge_count) self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) diff --git a/deepspeed/ops/transformer/inference/op_binding/__init__.py b/deepspeed/ops/transformer/inference/op_binding/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..890290692e66350b6a8843f6444994a4a696116e --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/__init__.py @@ -0,0 +1,10 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .linear import LinearOp +from .vector_matmul import VectorMatMulOp +from .softmax_context import SoftmaxContextOp +from .qkv_gemm import QKVGemmOp +from .softmax import SoftmaxOp +from .mlp_gemm import MLPGemmOp +from .gelu_gemm import GELUGemmOp +from .residual_add import ResidualAddOp diff --git a/deepspeed/ops/transformer/inference/op_binding/base.py b/deepspeed/ops/transformer/inference/op_binding/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ad29e0f773cbbc33138de257711eebe8d7370fdc --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/base.py @@ -0,0 +1,17 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from ..config import DeepSpeedInferenceConfig + +from deepspeed.ops.op_builder import InferenceBuilder + + +class BaseOp(torch.nn.Module): + inference_cuda_module = None + 
+ def __init__(self, config: DeepSpeedInferenceConfig): + super(BaseOp, self).__init__() + self.config = config + if BaseOp.inference_cuda_module is None: + builder = InferenceBuilder() + BaseOp.inference_cuda_module = builder.load() diff --git a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py new file mode 100644 index 0000000000000000000000000000000000000000..9ab4ef92687029b94b246ad33aa583362ee66c8d --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py @@ -0,0 +1,32 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class GELUGemmOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): + super(GELUGemmOp, self).__init__(config) + if self.config.fp16: + self.fused_gemm_gelu = self.inference_cuda_module.fused_gemm_gelu_fp16 + else: + self.fused_gemm_gelu = self.inference_cuda_module.fused_gemm_gelu_fp32 + + def forward(self, + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + weight_out: torch.Tensor, + async_op: bool = False): + output = self.fused_gemm_gelu(input, + weight, + weight.scale, + bias, + weight_out, + weight_out.scale, + self.config.epsilon, + self.config.pre_layer_norm, + self.config.q_int8, + async_op) + return output diff --git a/deepspeed/ops/transformer/inference/op_binding/linear.py b/deepspeed/ops/transformer/inference/op_binding/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..6d83ffce22bdc266a079d40c53eab119ce4249d8 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/linear.py @@ -0,0 +1,31 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class LinearOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): + super(LinearOp, self).__init__(config) + if self.config.fp16: + 
self.linear_func = self.inference_cuda_module.linear_layer_fp16 + else: + self.linear_func = self.inference_cuda_module.linear_layer_fp32 + + def forward(self, + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + add_bias: bool, + do_flash_attn: bool, + num_heads: int, + external_cache: bool = None, + num_layers: int = None): + qkv_out = self.linear_func(input, + weight, + bias, + add_bias, + do_flash_attn, + num_heads) + return qkv_out diff --git a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py new file mode 100644 index 0000000000000000000000000000000000000000..4df8ef52c3fb94b3af5ad8fc829e8b68e34f6dcb --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py @@ -0,0 +1,41 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class MLPGemmOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): + super(MLPGemmOp, self).__init__(config) + if self.config.fp16: + self.mlp_gemm_func = self.inference_cuda_module.mlp_gemm_fp16 + else: + self.mlp_gemm_func = self.inference_cuda_module.mlp_gemm_fp32 + + def forward(self, + input: torch.Tensor, + residual: torch.Tensor, + input_bias: torch.Tensor, + weight_interm: torch.Tensor, + weight_out: torch.Tensor, + bias: torch.Tensor, + gamma: torch.Tensor, + beta: torch.Tensor): + output, residual_add = self.mlp_gemm_func( + input, + residual, + input_bias, + weight_interm, + weight_out, + bias, + gamma, + beta, + self.config.epsilon, + self.config.pre_layer_norm, + self.config.mlp_after_attn, + weight_interm.scale, + weight_out.scale, + self.config.q_int8, + self.config.mlp_act_func_type) + return output, residual_add diff --git a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py new file mode 100644 index 
0000000000000000000000000000000000000000..d50dbfd3e7edcb1ab3ba5ce021135143fd5df90d --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py @@ -0,0 +1,44 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp +from deepspeed import comm as dist + + +class QKVGemmOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): + super(QKVGemmOp, self).__init__(config) + if self.config.fp16: + self.qkv_gemm_func = self.inference_cuda_module.qkv_gemm_fp16 + else: + self.qkv_gemm_func = self.inference_cuda_module.qkv_gemm_fp32 + + def forward(self, + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + gamma: torch.Tensor, + beta: torch.Tensor, + add_bias: bool, + num_layers: int, + num_heads: int = None, + max_out_tokens: int = None): + q_scale = weight.scale + external_cache = self.config.bigscience_bloom + rank = dist.get_rank() if dist.is_initialized() else 0 + q_int8 = self.config.q_int8 + output = self.qkv_gemm_func(input, + weight, + q_scale, + bias, + gamma, + beta, + self.config.epsilon, + add_bias, + num_layers, + external_cache, + self.config.mp_size, + rank, + q_int8) + return output diff --git a/deepspeed/ops/transformer/inference/op_binding/residual_add.py b/deepspeed/ops/transformer/inference/op_binding/residual_add.py new file mode 100644 index 0000000000000000000000000000000000000000..0fb1741a0095cad76dbcf5a4571b30a70d02a5c3 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/residual_add.py @@ -0,0 +1,38 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class ResidualAddOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): + super(ResidualAddOp, self).__init__(config) + if self.config.fp16 or self.config.q_int8: + self.residual_add_func = self.inference_cuda_module.residual_add_bias_fp16 + else: + 
self.residual_add_func = self.inference_cuda_module.residual_add_bias_fp32 + + def forward(self, + hidden_state: torch.Tensor, + residual: torch.Tensor, + attention_output: torch.Tensor, + attention_bias: torch.Tensor, + final_bias: torch.Tensor, + add_bias: bool, + residual_add: torch.Tensor): + + if not self.config.pre_layer_norm and residual_add is not None: + # only use residual add if its set and we are not pre layer norm + residual = residual_add + + self.residual_add_func(hidden_state, + residual, + attention_output, + attention_bias, + final_bias, + self.config.mp_size, + self.config.mlp_after_attn, + add_bias, + self.config.pre_layer_norm) + return residual diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax.py b/deepspeed/ops/transformer/inference/op_binding/softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..4d58ba4a4825246e4b37afb166db08072cc0af09 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/softmax.py @@ -0,0 +1,41 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class SoftmaxOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): + super(SoftmaxOp, self).__init__(config) + if self.config.fp16: + self.softmax_func = self.inference_cuda_module.softmax_fp16 + else: + self.softmax_func = self._not_implemented + + def _not_implemented(self, *args, **kwargs): + raise NotImplementedError + + def forward(self, + attn_scores: torch.Tensor, + attn_mask: torch.Tensor, + alibi: torch.Tensor, + triangular: bool, + recompute: bool, + local_attention: bool, + window_size: int, + async_op: bool, + layer_scale: float, + head_offset: int): + output = self.softmax_func(attn_scores, + attn_mask, + alibi, + triangular, + recompute, + local_attention, + window_size, + async_op, + layer_scale, + head_offset, + self.config.mp_size) + return output diff --git 
a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py new file mode 100644 index 0000000000000000000000000000000000000000..818af5f34a23de1637ccb572ce01008b5e5d82d4 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py @@ -0,0 +1,48 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from deepspeed import comm as dist +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class SoftmaxContextOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): + super(SoftmaxContextOp, self).__init__(config) + if self.config.fp16: + self.softmax_context_func = self.inference_cuda_module.softmax_context_fp16 + else: + self.softmax_context_func = self.inference_cuda_module.softmax_context_fp32 + + def forward(self, + query_key_value: torch.Tensor, + attn_mask: torch.Tensor, + heads: int, + norm_factor: float, + no_masking: bool, + layer_id: int, + num_layers: int, + alibi: torch.Tensor): + + if alibi is not None: + batch_heads = query_key_value.shape[0] * heads + offset = dist.get_rank() * batch_heads if dist.is_initialized() else 0 + alibi = alibi[offset:batch_heads + offset, :, :] + else: + alibi = torch.empty(1) + + output = self.softmax_context_func(query_key_value, + attn_mask, + self.config.rotary_dim, + self.config.rotate_half, + self.config.rotate_every_two, + heads, + norm_factor, + self.config.triangular_masking, + self.config.local_attention, + self.config.window_size, + no_masking, + layer_id, + num_layers, + alibi) + return output diff --git a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py new file mode 100644 index 0000000000000000000000000000000000000000..c22b61cbd7e577e9ff4fb2aa028ea1fd32386efc --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py @@ -0,0 +1,20 @@ +'''Copyright The Microsoft 
DeepSpeed Team''' + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class VectorMatMulOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): + super(VectorMatMulOp, self).__init__(config) + if self.config.fp16: + self.vector_matmul_func = self.inference_cuda_module.vector_matmul_fp16 + else: + self.vector_matmul_func = self.inference_cuda_module.vector_matmul_fp32 + + def forward(self, input: torch.Tensor, weight: torch.Tensor, async_op: bool = False): + q_scale = weight.scale + q_int8 = self.config.q_int8 + output = self.vector_matmul_func(input, weight, async_op, q_scale, q_int8) + return output diff --git a/deepspeed/ops/transformer/inference/triton_ops.py b/deepspeed/ops/transformer/inference/triton_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..45c6db66d1078f9f5442807a6d4f757ad12adec6 --- /dev/null +++ b/deepspeed/ops/transformer/inference/triton_ops.py @@ -0,0 +1,152 @@ +'''Copyright The Microsoft DeepSpeed Team''' +""" +Inspired by original Triton implementation: +https://github.com/openai/triton/blob/b244db06da24a87453a40ad35b085ee37dac3705/python/tutorials/06-fused-attention.py +""" + +import torch +import triton +import triton.language as tl + + +@triton.jit +def _fwd_kernel( + Q, + K, + V, + sm_scale, + TMP, + Out, + stride_qz, + stride_qh, + stride_qm, + stride_qk, + stride_kz, + stride_kh, + stride_kn, + stride_kk, + stride_vz, + stride_vh, + stride_vk, + stride_vn, + stride_oz, + stride_oh, + stride_om, + stride_on, + Z, + H, + N_CTX, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + start_m = tl.program_id(0) + off_hz = tl.program_id(1) + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk + off_k = off_hz * stride_kh + offs_n[:, None] * stride_kn + 
offs_d[None, :] * stride_kk + off_v = off_hz * stride_vh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk + # Initialize pointers to Q, K, V + q_ptrs = Q + off_q + k_ptrs = K + off_k + v_ptrs = V + off_v + # initialize pointer to m and l + t_ptrs = TMP + off_hz * N_CTX + offs_m + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # load q: it will stay in SRAM throughout + q = tl.load(q_ptrs) + # loop over k, v and update accumulator + for start_n in range(0, N_CTX, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(k_ptrs + start_n * stride_kn) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k, trans_b=True) + qk *= sm_scale + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + m_i_new = tl.maximum(m_i, m_ij) + alpha = tl.exp(m_i - m_i_new) + beta = tl.exp(m_ij - m_i_new) + l_i_new = alpha * l_i + beta * l_ij + # -- update output accumulator -- + # scale p + p_scale = beta / l_i_new + p = p * p_scale[:, None] + # scale acc + acc_scale = l_i / l_i_new * alpha + tl.store(t_ptrs, acc_scale) + acc_scale = tl.load(t_ptrs) # BUG: have to store and immediately load + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(v_ptrs + start_n * stride_vk) + p = p.to(tl.float16) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + # initialize pointers to output + offs_n = tl.arange(0, BLOCK_DMODEL) + off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on + out_ptrs = Out + off_o + tl.store(out_ptrs, acc) + + +class triton_flash_attn(torch.nn.Module): + def __init__(self, ): + super(triton_flash_attn, self).__init__() + + def forward(self, q, k, v, sm_scale, block_128=True): + BLOCK = 128 if block_128 else 64 + # shape constraints + Lq, Lk, Lv = 
q.shape[-1], k.shape[-1], v.shape[-1] + o = torch.empty_like(q) + grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1]) + tmp = torch.empty((q.shape[0] * q.shape[1], + q.shape[2]), + device=q.device, + dtype=torch.float32) + num_warps = 4 if Lk <= 64 else 8 + + _fwd_kernel[grid]( + q, + k, + v, + sm_scale, + tmp, + o, + q.stride(0), + q.stride(1), + q.stride(2), + q.stride(3), + k.stride(0), + k.stride(1), + k.stride(2), + k.stride(3), + v.stride(0), + v.stride(1), + v.stride(2), + v.stride(3), + o.stride(0), + o.stride(1), + o.stride(2), + o.stride(3), + k.shape[0], + k.shape[1], + k.shape[2], + BLOCK_M=BLOCK, + BLOCK_N=BLOCK, + BLOCK_DMODEL=Lk, + num_warps=num_warps, + num_stages=1, + ) + return o diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py old mode 100644 new mode 100755 index 4b3104b6bdeadc8e22d19e912737a3dd20a47bf1..ae627304f02bbdd1d90f2859a20f2823154dc5db --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -3,12 +3,11 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' import json import math -import importlib import torch from torch import nn from torch.autograd import Function - -from ..op_builder import TransformerBuilder, StochasticTransformerBuilder +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import TransformerBuilder, StochasticTransformerBuilder # Cuda modules will be imported if needed transformer_cuda_module = None @@ -218,7 +217,7 @@ class DeepSpeedTransformerFunction(Function): output_b, norm_w, norm_b, - config.training, + config.training and config.is_grad_enabled, config.pre_layer_norm, config.attn_dropout_checkpoint, config.normalize_invertible, @@ -482,7 +481,7 @@ class DeepSpeedTransformerLayer(nn.Module): print("DeepSpeed Transformer config is ", self.config.__dict__) if self.config.local_rank >= 0: - torch.cuda.set_device(self.config.local_rank) + 
get_accelerator().set_device(self.config.local_rank) if initial_weights is None and initial_biases is None: self.attn_qkvw = nn.Parameter( @@ -587,6 +586,7 @@ class DeepSpeedTransformerLayer(nn.Module): output_attentions=False, grads=None): self.config.is_grad_enabled = torch.is_grad_enabled() + self.config.training = self.training return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, diff --git a/deepspeed/pipe/__init__.py b/deepspeed/pipe/__init__.py index db1308172f08354cc8f8ebfc399f03f900502548..bbabf4feb7e03df2e5fe587ba5870ccd2cc57101 100644 --- a/deepspeed/pipe/__init__.py +++ b/deepspeed/pipe/__init__.py @@ -1 +1,3 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec diff --git a/deepspeed/profiling/__init__.py b/deepspeed/profiling/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 100644 --- a/deepspeed/profiling/__init__.py +++ b/deepspeed/profiling/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/profiling/config.py b/deepspeed/profiling/config.py index 0671211132c626e2a0377218dd5fe75983acd888..c22cd453fcba7fc8e67b07d5b6ce70f6f3c1dd36 100644 --- a/deepspeed/profiling/config.py +++ b/deepspeed/profiling/config.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ Copyright (c) Microsoft Corporation Licensed under the MIT license. diff --git a/deepspeed/profiling/constants.py b/deepspeed/profiling/constants.py index d999dc61bd9fbe1aa56d4a5cf18d15d1b842f90e..162f1d3e7f2face649f088065ffeb42aa1452633 100644 --- a/deepspeed/profiling/constants.py +++ b/deepspeed/profiling/constants.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ Copyright (c) Microsoft Corporation Licensed under the MIT license. 
diff --git a/deepspeed/profiling/flops_profiler/__init__.py b/deepspeed/profiling/flops_profiler/__init__.py index 2f033c862baa41f9b440752182a27483a64ff116..7454821bc1ccfde04924ef3c04642e3b952409e9 100644 --- a/deepspeed/profiling/flops_profiler/__init__.py +++ b/deepspeed/profiling/flops_profiler/__init__.py @@ -1 +1,3 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .profiler import * diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py index 92c25ec94ca9516d051b635b102e84811d00a7c4..b6684f6978c370204b04cb09f1946c3b3b794f27 100644 --- a/deepspeed/profiling/flops_profiler/profiler.py +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -1,11 +1,14 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import time import torch import torch.nn as nn import torch.nn.functional as F from functools import partial -from typing import Callable, List, Optional, Tuple +from typing import List, Optional from collections import OrderedDict import numpy as np +from deepspeed.accelerator import get_accelerator Tensor = torch.Tensor @@ -74,8 +77,9 @@ class FlopsProfiler(object): # if computing the flops of a module directly if type(module) in MODULE_HOOK_MAPPING: - module.__flops_handle__ = module.register_forward_hook( - MODULE_HOOK_MAPPING[type(module)]) + if not hasattr(module, "__flops_handle__"): + module.__flops_handle__ = module.register_forward_hook( + MODULE_HOOK_MAPPING[type(module)]) return # if computing the flops of the functionals in a module @@ -83,7 +87,8 @@ class FlopsProfiler(object): module_flop_count.append([]) module_mac_count.append([]) - module.__pre_hook_handle__ = module.register_forward_pre_hook(pre_hook) + if not hasattr(module, "__pre_hook_handle__"): + module.__pre_hook_handle__ = module.register_forward_pre_hook(pre_hook) def post_hook(module, input, output): if module_flop_count: @@ -92,20 +97,24 @@ class FlopsProfiler(object): module.__macs__ += sum([elem[1] for elem in 
module_mac_count[-1]]) module_mac_count.pop() - module.__post_hook_handle__ = module.register_forward_hook(post_hook) + if not hasattr(module, "__post_hook_handle__"): + module.__post_hook_handle__ = module.register_forward_hook(post_hook) def start_time_hook(module, input): - torch.cuda.synchronize() + get_accelerator().synchronize() module.__start_time__ = time.time() - module.__start_time_hook_handle__ = module.register_forward_pre_hook( - start_time_hook) + if not hasattr(module, "__start_time_hook_handle"): + module.__start_time_hook_handle__ = module.register_forward_pre_hook( + start_time_hook) def end_time_hook(module, input, output): - torch.cuda.synchronize() + get_accelerator().synchronize() module.__duration__ += time.time() - module.__start_time__ - module.__end_time_hook_handle__ = module.register_forward_hook(end_time_hook) + if not hasattr(module, "__end_time_hook_handle__"): + module.__end_time_hook_handle__ = module.register_forward_hook( + end_time_hook) self.model.apply(partial(register_module_hooks, ignore_list=ignore_list)) self.started = True @@ -148,8 +157,7 @@ class FlopsProfiler(object): def add_or_reset_attrs(module): module.__flops__ = 0 module.__macs__ = 0 - module.__params__ = sum(p.numel() for p in module.parameters() - if p.requires_grad) + module.__params__ = sum(p.numel() for p in module.parameters()) module.__start_time__ = 0 module.__duration__ = 0 @@ -246,11 +254,10 @@ class FlopsProfiler(object): return import sys import os.path - from os import path original_stdout = None f = None if output_file and output_file != "": - dir_path = os.path.dirname(output_file) + dir_path = os.path.dirname(os.path.abspath(output_file)) if not os.path.exists(dir_path): os.makedirs(dir_path) original_stdout = sys.stdout @@ -287,7 +294,7 @@ class FlopsProfiler(object): print('{:<60} {:<8}'.format( 'params of model = params per GPU * mp_size: ', params_to_string(total_params * - (self.ds_engine.mp_world_size) if self.ds_engine else 1))) + 
((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) print('{:<60} {:<8}'.format('fwd MACs per GPU: ', macs_to_string(total_macs))) @@ -296,19 +303,19 @@ class FlopsProfiler(object): print('{:<60} {:<8}'.format( 'fwd flops of model = fwd flops per GPU * mp_size: ', num_to_string(total_flops * - (self.ds_engine.mp_world_size) if self.ds_engine else 1))) + ((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) fwd_latency = self.get_total_duration() if self.ds_engine and self.ds_engine.wall_clock_breakdown(): - fwd_latency = self.ds_engine.timers('forward').elapsed(False) + fwd_latency = self.ds_engine.timers('forward').elapsed(False) / 1000.0 print('{:<60} {:<8}'.format('fwd latency: ', duration_to_string(fwd_latency))) print('{:<60} {:<8}'.format( 'fwd FLOPS per GPU = fwd flops per GPU / fwd latency: ', flops_to_string(total_flops / fwd_latency))) if self.ds_engine and self.ds_engine.wall_clock_breakdown(): - bwd_latency = self.ds_engine.timers('backward').elapsed(False) - step_latency = self.ds_engine.timers('step').elapsed(False) + bwd_latency = self.ds_engine.timers('backward').elapsed(False) / 1000.0 + step_latency = self.ds_engine.timers('step').elapsed(False) / 1000.0 print('{:<60} {:<8}'.format('bwd latency: ', duration_to_string(bwd_latency))) print('{:<60} {:<8}'.format( @@ -339,7 +346,7 @@ class FlopsProfiler(object): macs = get_module_macs(module) items = [ params_to_string(params), - "{:.2%} Params".format(params / total_params), + "{:.2%} Params".format(params / total_params if total_params else 0), macs_to_string(macs), "{:.2%} MACs".format(0.0 if total_macs == 0 else macs / total_macs), ] @@ -476,50 +483,50 @@ def _prod(dims): def _linear_flops_compute(input, weight, bias=None): out_features = weight.shape[0] - macs = torch.numel(input) * out_features + macs = input.numel() * out_features return 2 * macs, macs def _relu_flops_compute(input, inplace=False): - return torch.numel(input), 0 + return input.numel(), 0 def 
_prelu_flops_compute(input: Tensor, weight: Tensor): - return torch.numel(input), 0 + return input.numel(), 0 def _elu_flops_compute(input: Tensor, alpha: float = 1.0, inplace: bool = False): - return torch.numel(input), 0 + return input.numel(), 0 def _leaky_relu_flops_compute(input: Tensor, negative_slope: float = 0.01, inplace: bool = False): - return torch.numel(input), 0 + return input.numel(), 0 def _relu6_flops_compute(input: Tensor, inplace: bool = False): - return torch.numel(input), 0 + return input.numel(), 0 def _silu_flops_compute(input: Tensor, inplace: bool = False): - return torch.numel(input), 0 + return input.numel(), 0 -def _gelu_flops_compute(input): - return torch.numel(input), 0 +def _gelu_flops_compute(input, **kwargs): + return input.numel(), 0 -def _pool_flops_compute( - input, - kernel_size, - stride=None, - padding=0, - ceil_mode=False, - count_include_pad=True, - divisor_override=None, -): - return torch.numel(input), 0 +def _pool_flops_compute(input, + kernel_size, + stride=None, + padding=0, + dilation=None, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, + return_indices=None): + return input.numel(), 0 def _conv_flops_compute(input, @@ -534,7 +541,7 @@ def _conv_flops_compute(input, batch_size = input.shape[0] in_channels = input.shape[1] out_channels = weight.shape[0] - kernel_dims = list(weight.shape[-2:]) + kernel_dims = list(weight.shape[2:]) input_dims = list(input.shape[2:]) length = len(input_dims) @@ -575,7 +582,7 @@ def _conv_trans_flops_compute( batch_size = input.shape[0] in_channels = input.shape[1] out_channels = weight.shape[0] - kernel_dims = list(weight.shape[-2:]) + kernel_dims = list(weight.shape[2:]) input_dims = list(input.shape[2:]) length = len(input_dims) @@ -621,8 +628,8 @@ def _batch_norm_flops_compute( has_affine = weight is not None if training: # estimation - return torch.numel(input) * (5 if has_affine else 4), 0 - flops = torch.numel(input) * (2 if has_affine else 1) + return 
input.numel() * (5 if has_affine else 4), 0 + flops = input.numel() * (2 if has_affine else 1) return flops, 0 @@ -635,7 +642,7 @@ def _layer_norm_flops_compute( ): has_affine = weight is not None # estimation - return torch.numel(input) * (5 if has_affine else 4), 0 + return input.numel() * (5 if has_affine else 4), 0 def _group_norm_flops_compute(input: Tensor, @@ -645,7 +652,7 @@ def _group_norm_flops_compute(input: Tensor, eps: float = 1e-5): has_affine = weight is not None # estimation - return torch.numel(input) * (5 if has_affine else 4), 0 + return input.numel() * (5 if has_affine else 4), 0 def _instance_norm_flops_compute( @@ -660,21 +667,19 @@ def _instance_norm_flops_compute( ): has_affine = weight is not None # estimation - return torch.numel(input) * (5 if has_affine else 4), 0 + return input.numel() * (5 if has_affine else 4), 0 -def _upsample_flops_compute(input, - size=None, - scale_factor=None, - mode="nearest", - align_corners=None): +def _upsample_flops_compute(input, **kwargs): + size = kwargs.get('size', None) if size is not None: - if isinstance(size, tuple): + if isinstance(size, tuple) or isinstance(size, list): return int(_prod(size)), 0 else: return int(size), 0 + scale_factor = kwargs.get('scale_factor', None) assert scale_factor is not None, "either size or scale_factor should be defined" - flops = torch.numel(input) + flops = input.numel() if isinstance(scale_factor, tuple) and len(scale_factor) == len(input): flops * int(_prod(scale_factor)) else: @@ -683,7 +688,7 @@ def _upsample_flops_compute(input, def _softmax_flops_compute(input, dim=None, _stacklevel=3, dtype=None): - return torch.numel(input), 0 + return input.numel(), 0 def _embedding_flops_compute( @@ -783,7 +788,7 @@ def _elementwise_flops_compute(input, other): def wrapFunc(func, funcFlopCompute): oldFunc = func - name = func.__name__ + name = func.__str__ old_functions[name] = oldFunc def newFunc(*args, **kwds): @@ -794,7 +799,7 @@ def wrapFunc(func, funcFlopCompute): 
module_mac_count[-1].append((name, macs)) return oldFunc(*args, **kwds) - newFunc.__name__ = func.__name__ + newFunc.__str__ = func.__str__ return newFunc @@ -860,7 +865,7 @@ def _patch_tensor_methods(): torch.mm = wrapFunc(torch.mm, _matmul_flops_compute) torch.Tensor.mm = wrapFunc(torch.Tensor.mm, _matmul_flops_compute) torch.bmm = wrapFunc(torch.bmm, _matmul_flops_compute) - torch.Tensor.bmm = wrapFunc(torch.bmm, _matmul_flops_compute) + torch.Tensor.bmm = wrapFunc(torch.Tensor.bmm, _matmul_flops_compute) torch.addmm = wrapFunc(torch.addmm, _addmm_flops_compute) torch.Tensor.addmm = wrapFunc(torch.Tensor.addmm, _tensor_addmm_flops_compute) @@ -873,42 +878,65 @@ def _patch_tensor_methods(): torch.einsum = wrapFunc(torch.einsum, _einsum_flops_compute) + torch.baddbmm = wrapFunc(torch.baddbmm, _tensor_addmm_flops_compute) + def _reload_functionals(): # torch.nn.functional does not support importlib.reload() - F.linear = old_functions[F.linear.__name__] - F.conv1d = old_functions[F.conv1d.__name__] - F.conv2d = old_functions[F.conv2d.__name__] - F.conv3d = old_functions[F.conv3d.__name__] - F.conv_transpose1d = old_functions[F.conv_transpose1d.__name__] - F.conv_transpose2d = old_functions[F.conv_transpose2d.__name__] - F.conv_transpose3d = old_functions[F.conv_transpose3d.__name__] - F.relu = old_functions[F.relu.__name__] - F.prelu = old_functions[F.prelu.__name__] - F.elu = old_functions[F.elu.__name__] - F.leaky_relu = old_functions[F.leaky_relu.__name__] - F.relu6 = old_functions[F.relu6.__name__] - F.batch_norm = old_functions[F.batch_norm.__name__] - F.avg_pool1d = old_functions[F.avg_pool1d.__name__] - F.avg_pool2d = old_functions[F.avg_pool2d.__name__] - F.avg_pool3d = old_functions[F.avg_pool3d.__name__] - F.max_pool1d = old_functions[F.max_pool1d.__name__] - F.max_pool2d = old_functions[F.max_pool2d.__name__] - F.max_pool3d = old_functions[F.max_pool3d.__name__] - F.adaptive_avg_pool1d = old_functions[F.adaptive_avg_pool1d.__name__] - 
F.adaptive_avg_pool2d = old_functions[F.adaptive_avg_pool2d.__name__] - F.adaptive_avg_pool3d = old_functions[F.adaptive_avg_pool3d.__name__] - F.adaptive_max_pool1d = old_functions[F.adaptive_max_pool1d.__name__] - F.adaptive_max_pool2d = old_functions[F.adaptive_max_pool2d.__name__] - F.adaptive_max_pool3d = old_functions[F.adaptive_max_pool3d.__name__] - F.upsample = old_functions[F.upsample.__name__] - F.interpolate = old_functions[F.interpolate.__name__] - F.softmax = old_functions[F.softmax.__name__] - F.embedding = old_functions[F.embedding.__name__] + F.linear = old_functions[F.linear.__str__] + F.conv1d = old_functions[F.conv1d.__str__] + F.conv2d = old_functions[F.conv2d.__str__] + F.conv3d = old_functions[F.conv3d.__str__] + F.conv_transpose1d = old_functions[F.conv_transpose1d.__str__] + F.conv_transpose2d = old_functions[F.conv_transpose2d.__str__] + F.conv_transpose3d = old_functions[F.conv_transpose3d.__str__] + F.relu = old_functions[F.relu.__str__] + F.prelu = old_functions[F.prelu.__str__] + F.elu = old_functions[F.elu.__str__] + F.leaky_relu = old_functions[F.leaky_relu.__str__] + F.relu6 = old_functions[F.relu6.__str__] + if hasattr(F, "silu"): + F.silu = old_functions[F.silu.__str__] + F.gelu = old_functions[F.gelu.__str__] + F.batch_norm = old_functions[F.batch_norm.__str__] + F.layer_norm = old_functions[F.layer_norm.__str__] + F.instance_norm = old_functions[F.instance_norm.__str__] + F.group_norm = old_functions[F.group_norm.__str__] + F.avg_pool1d = old_functions[F.avg_pool1d.__str__] + F.avg_pool2d = old_functions[F.avg_pool2d.__str__] + F.avg_pool3d = old_functions[F.avg_pool3d.__str__] + F.max_pool1d = old_functions[F.max_pool1d.__str__] + F.max_pool2d = old_functions[F.max_pool2d.__str__] + F.max_pool3d = old_functions[F.max_pool3d.__str__] + F.adaptive_avg_pool1d = old_functions[F.adaptive_avg_pool1d.__str__] + F.adaptive_avg_pool2d = old_functions[F.adaptive_avg_pool2d.__str__] + F.adaptive_avg_pool3d = 
old_functions[F.adaptive_avg_pool3d.__str__] + F.adaptive_max_pool1d = old_functions[F.adaptive_max_pool1d.__str__] + F.adaptive_max_pool2d = old_functions[F.adaptive_max_pool2d.__str__] + F.adaptive_max_pool3d = old_functions[F.adaptive_max_pool3d.__str__] + F.upsample = old_functions[F.upsample.__str__] + F.interpolate = old_functions[F.interpolate.__str__] + F.softmax = old_functions[F.softmax.__str__] + F.embedding = old_functions[F.embedding.__str__] def _reload_tensor_methods(): - torch.matmul = old_functions[torch.matmul.__name__] + torch.matmul = old_functions[torch.matmul.__str__] + torch.Tensor.matmul = old_functions[torch.Tensor.matmul.__str__] + torch.mm = old_functions[torch.mm.__str__] + torch.Tensor.mm = old_functions[torch.Tensor.mm.__str__] + torch.bmm = old_functions[torch.matmul.__str__] + torch.Tensor.bmm = old_functions[torch.Tensor.bmm.__str__] + torch.addmm = old_functions[torch.addmm.__str__] + torch.Tensor.addmm = old_functions[torch.Tensor.addmm.__str__] + torch.mul = old_functions[torch.mul.__str__] + torch.Tensor.mul = old_functions[torch.Tensor.mul.__str__] + torch.add = old_functions[torch.add.__str__] + torch.Tensor.add = old_functions[torch.Tensor.add.__str__] + + torch.einsum = old_functions[torch.einsum.__str__] + + torch.baddbmm = old_functions[torch.baddbmm.__str__] def _rnn_flops(flops, rnn_module, w_ih, w_hh, input_size): @@ -1191,15 +1219,19 @@ def get_model_profile( input = torch.ones(()).new_empty((*input_shape, )) args = [input] - assert (len(args) > 0) or (len(kwargs) > 0), "args and/or kwargs must be specified if input_shape is None" for _ in range(warm_up): - _ = model(*args, **kwargs) - + if kwargs: + _ = model(*args, **kwargs) + else: + _ = model(*args) prof.start_profile(ignore_list=ignore_modules) - _ = model(*args, **kwargs) + if kwargs: + _ = model(*args, **kwargs) + else: + _ = model(*args) flops = prof.get_total_flops() macs = prof.get_total_macs() diff --git a/deepspeed/runtime/__init__.py 
b/deepspeed/runtime/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..aa1f39f9e194699befc81c01eb057de2150549b7 100644 --- a/deepspeed/runtime/__init__.py +++ b/deepspeed/runtime/__init__.py @@ -0,0 +1,9 @@ +'''Copyright The Microsoft DeepSpeed Team''' + + +class DeepSpeedOptimizer(object): + pass + + +class ZeROOptimizer(DeepSpeedOptimizer): + pass diff --git a/deepspeed/runtime/activation_checkpointing/__init__.py b/deepspeed/runtime/activation_checkpointing/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 100644 --- a/deepspeed/runtime/activation_checkpointing/__init__.py +++ b/deepspeed/runtime/activation_checkpointing/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 06d1a2a59833f74e6e7dfc2f2a84e780a05cbd8d..74372540106f0c3ab7bfdf2e9d647aa303d10574 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' ''' Copyright (c) Microsoft Corporation Licensed under the MIT license. 
@@ -16,16 +17,16 @@ b886b7bb972afe72bac0f5de4f42a4a7bae8ebef import copy import torch import contextlib -import torch.distributed as dist +from deepspeed import comm as dist import mmap from torch import _C -from torch.cuda import _lazy_call, device as device_ctx_manager from deepspeed.runtime.config import DeepSpeedConfig from deepspeed.utils import logger from deepspeed.runtime.utils import copy_to_device, move_to_device, see_memory_usage, bwc_tensor_model_parallel_rank from deepspeed.utils.timer import SynchronizedWallClockTimer as Timers +from deepspeed.accelerator import get_accelerator # DeepSpeed Checkpointing Enabled or Disabled deepspeed_checkpointing_enabled = False @@ -98,25 +99,25 @@ def _set_cuda_rng_state(new_state, device=-1): if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): # older PyTorch def cb(): - with device_ctx_manager(device): + with get_accelerator().device(device): _C._cuda_setRNGState(new_state) else: # newer PyTorch if device == -1: - device = torch.device('cuda') + device = torch.device(get_accelerator().device_name()) elif isinstance(device, str): device = torch.device(device) elif isinstance(device, int): - device = torch.device('cuda', device) + device = torch.device(get_accelerator().device_name(), device) def cb(): idx = device.index if idx is None: - idx = torch.cuda.current_device() - default_generator = torch.cuda.default_generators[idx] + idx = get_accelerator().current_device() + default_generator = get_accelerator().default_generator(idx) default_generator.set_state(new_state) - _lazy_call(cb) + get_accelerator().lazy_call(cb) class CudaRNGStatesTracker: @@ -158,10 +159,10 @@ class CudaRNGStatesTracker: if name in self.states_: raise Exception('cuda rng state {} already exists'.format(name)) # Get the current rng state. - orig_rng_state = torch.cuda.get_rng_state() + orig_rng_state = get_accelerator().get_rng_state() # Set the new state and store it. 
- torch.cuda.manual_seed(seed) - self.states_[name] = torch.cuda.get_rng_state() + get_accelerator().manual_seed(seed) + self.states_[name] = get_accelerator().get_rng_state() # Reset rng state to what it was. _set_cuda_rng_state(orig_rng_state) @@ -173,7 +174,7 @@ class CudaRNGStatesTracker: if name not in self.states_: raise Exception('cuda rng state {} is not added'.format(name)) # Store current rng state. - orig_cuda_rng_state = torch.cuda.get_rng_state() + orig_cuda_rng_state = get_accelerator().get_rng_state() # Set rng state to the desired one _set_cuda_rng_state(self.states_[name]) # Do the stuff we wanted to do. @@ -181,7 +182,7 @@ class CudaRNGStatesTracker: yield finally: # Update the current rng state for later use. - self.states_[name] = torch.cuda.get_rng_state() + self.states_[name] = get_accelerator().get_rng_state() # And set the state to the original state we started with. _set_cuda_rng_state(orig_cuda_rng_state) @@ -199,7 +200,7 @@ def model_parallel_cuda_manual_seed(seed): """Initialize model parallel cuda seed. This function should be called after the model parallel is - initialized. Also, no torch.cuda.manual_seed should be called + initialized. Also, no get_accelerator().manual_seed should be called after this function. Basically, this is replacement for that function. Two set of RNG states are tracked: @@ -222,12 +223,12 @@ def model_parallel_cuda_manual_seed(seed): # Data parallel gets the original seed. data_parallel_seed = seed - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: logger.info( '> initializing model parallel cuda seeds on global rank {}, ' 'model parallel rank {}, and data parallel rank {} with ' 'model parallel seed: {} and data parallel seed: {}'.format( - torch.distributed.get_rank(), + dist.get_rank(), tp_rank, mpu.get_data_parallel_rank(), model_parallel_seed, @@ -235,7 +236,7 @@ def model_parallel_cuda_manual_seed(seed): ) _CUDA_RNG_STATE_TRACKER.reset() # Set the default state. 
- torch.cuda.manual_seed(data_parallel_seed) + get_accelerator().manual_seed(data_parallel_seed) # and model parallel state. _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, model_parallel_seed) @@ -270,6 +271,12 @@ def gather_partitioned_activations(tensors, device=None): inputs.append(item) continue + # don't need to do all_gather if model parallel is not enabled + if mp_group is None or mp_size == 1: + item = item.view(list(size.numpy())) + inputs.append(item) + continue + partition_size = item.numel() tensor_size = partition_size * mp_size if device is not None: @@ -284,8 +291,7 @@ def gather_partitioned_activations(tensors, device=None): if i == mp_rank: part_i.copy_(item) partitions.append(part_i) - if mp_group is not None: - dist.all_gather(partitions, partitions[mp_rank], group=mp_group) + dist.all_gather(partitions, partitions[mp_rank], group=mp_group) input_tensor = flat_tensor.view(list(size.numpy())) item.data = input_tensor.data @@ -511,7 +517,7 @@ class CheckpointFunction(torch.autograd.Function): ctx.tensor_flags = tensor_flags if SYNCHRONIZE: - torch.cuda.synchronize() + get_accelerator().synchronize() if timers is None and PROFILE_TIME: timers = Timers() @@ -554,8 +560,8 @@ class CheckpointFunction(torch.autograd.Function): logger.info(f"----Synchronization {SYNCHRONIZE}") logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") - cuda_device = torch.cuda.current_device() - transport_stream = torch.cuda.Stream(device=cuda_device) + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) if PARTITION_ACTIVATIONS: inputs = partition_activations(args, @@ -573,7 +579,7 @@ class CheckpointFunction(torch.autograd.Function): # Copy the rng states. 
ctx.fwd_cpu_rng_state = torch.get_rng_state() - ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state = get_accelerator().get_rng_state() ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() see_memory_usage("Before running forward on the layer", force=False) @@ -601,7 +607,7 @@ class CheckpointFunction(torch.autograd.Function): timers('forward').stop() timers.log(['forward']) if SYNCHRONIZE: - torch.cuda.synchronize() + get_accelerator().synchronize() # Tensors returned from forward() may not be differentiable. if torch.is_tensor(outputs): @@ -628,7 +634,7 @@ class CheckpointFunction(torch.autograd.Function): # so that they can be garbage collected once the checkpoints # have been used if SYNCHRONIZE: - torch.cuda.synchronize() + get_accelerator().synchronize() if PROFILE_TIME: timers('backward').start() @@ -654,7 +660,7 @@ class CheckpointFunction(torch.autograd.Function): global cuda_device, transport_stream, PARTITION_ACTIVATIONS if PARTITION_ACTIVATIONS: - # with torch.cuda.stream(transport_stream): + # with get_accelerator().stream(transport_stream): inputs = gather_partitioned_activations( ctx.deepspeed_saved_tensors, device=cuda_device if CPU_CHECKPOINT else None) @@ -675,7 +681,7 @@ class CheckpointFunction(torch.autograd.Function): # Store the current states. bwd_cpu_rng_state = torch.get_rng_state() - bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state = get_accelerator().get_rng_state() bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() # Set the states to what it used to be before the forward pass. 
@@ -684,7 +690,7 @@ class CheckpointFunction(torch.autograd.Function): get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) # if PARTITION_ACTIVATIONS: - # current_stream=torch.cuda.current_stream() + # current_stream=get_accelerator().current_stream() # current_stream.wait_stream(transport_stream) see_memory_usage("In backward checkpointing code before forward", force=False) @@ -729,7 +735,7 @@ class CheckpointFunction(torch.autograd.Function): timers('backward').stop() timers.log(['backward']) if SYNCHRONIZE: - torch.cuda.synchronize() + get_accelerator().synchronize() ret_list = [None, None] # first None for ctx for inp in detached_inputs: if torch.is_tensor(inp): @@ -856,7 +862,7 @@ def configure( checkpoint_in_cpu: Optional: Moves the activation checkpoint to CPU. Only works with partition_activation. Default is false. Will overwrite deepspeed_config if provided - synchronize: Optional: Performs torch.cuda.synchronize() at the beginning and end of + synchronize: Optional: Performs get_accelerator().synchronize() at the beginning and end of each call to deepspeed.checkpointing.checkpoint for both forward and backward pass. By default false. Will overwrite deepspeed_config if provided diff --git a/deepspeed/runtime/activation_checkpointing/config.py b/deepspeed/runtime/activation_checkpointing/config.py old mode 100644 new mode 100755 index 0ab59ac64eeadaa77e29e6b51f010cc1e5a3cc3b..0e79579435b82134676dc47a8c3f625ab3acfd81 --- a/deepspeed/runtime/activation_checkpointing/config.py +++ b/deepspeed/runtime/activation_checkpointing/config.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ Copyright (c) Microsoft Corporation Licensed under the MIT license. 
diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index 43c23f0b77f1ace5720c6160ced5ae7b95c97163..f667d279ca542bd32dcea0930bdb1c8b1a2b7325 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -1,80 +1,44 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" + +from collections import OrderedDict import torch -import torch.distributed as dist +import sys +import os +from deepspeed import comm as dist from deepspeed.runtime.constants import PIPE_REPLICATED from deepspeed.ops.op_builder import UtilsBuilder +from deepspeed.runtime import ZeROOptimizer from packaging import version as pkg_version from deepspeed.git_version_info import version from deepspeed.runtime.utils import (get_global_norm_of_tensors, clip_tensors_by_global_norm, - get_grad_norm, - clip_gradients, + DummyOptim, align_dense_tensors, all_gather_dp_groups, bwc_tensor_model_parallel_rank, is_model_parallel_parameter, see_memory_usage) +from deepspeed.utils import link_hp_params, fragment_address +from deepspeed.checkpoint import enable_universal_checkpoint from deepspeed.checkpoint.constants import (DS_VERSION, PARTITION_COUNT, BASE_OPTIMIZER_STATE, SINGLE_PARTITION_OF_FP32_GROUPS, CLIP_GRAD, - GROUPS_PADDING) - -import types - -from dataclasses import dataclass - - -@dataclass -class fragment_address: - numel: int - start: int - - -@dataclass -class tensor_fragment: - lp_fragment: torch.Tensor - lp_fragment_address: fragment_address - hp_fragment: torch.Tensor - hp_fragment_address: fragment_address - optim_fragment: {} - - def update_hp(self): - self.hp_fragment.data.copy_(self.lp_fragment.data) - - def update_lp(self): - self.lp_fragment.data.copy_(self.hp_fragment.data) + GROUP_PADDINGS, + PARAM_SLICE_MAPPINGS) - def get_optim_state_fragment(self, key): - if key in self.optim_fragment: - return self.optim_fragment[key] - else: - raise ValueError(f'{key} not found in optimizer state fragment') - - -def 
get_full_hp_param(self, optim_state_key=None): - reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() - if self._hp_mapping is not None: - lp_frag_address = self._hp_mapping.lp_fragment_address - reduce_fragment = torch.narrow(reduce_buffer, - 0, - lp_frag_address.start, - lp_frag_address.numel) - if optim_state_key is None: - hp_fragment = self._hp_mapping.hp_fragment - else: - hp_fragment = self._hp_mapping.get_optim_state_fragment(optim_state_key) - - reduce_fragment.data.copy_(hp_fragment.data) - torch.distributed.all_reduce(reduce_buffer, group=self._dp_group) - return reduce_buffer.reshape_as(self) +setattr(sys.modules[__name__], 'fragment_address', fragment_address) -class BF16_Optimizer: +class BF16_Optimizer(ZeROOptimizer): def __init__(self, init_optimizer, + param_names, mpu=None, clip_grad=0.0, norm_type=2, @@ -85,6 +49,9 @@ class BF16_Optimizer: see_memory_usage('begin bf16_optimizer', force=True) self.timers = timers self.optimizer = init_optimizer + self.param_names = param_names + self.using_real_optimizer = not isinstance(self.optimizer, DummyOptim) + self.clip_grad = clip_grad self.norm_type = norm_type self.mpu = mpu @@ -94,10 +61,6 @@ class BF16_Optimizer: self.real_dp_process_group = [ dp_process_group for i in range(len(self.optimizer.param_groups)) ] - dp_world_size = dist.get_world_size(group=self.dp_process_group) - self.partition_count = [ - dp_world_size for i in range(len(self.optimizer.param_groups)) - ] # Load pre-built or JIT compile (un)flatten ops util_ops = UtilsBuilder().load() @@ -116,13 +79,25 @@ class BF16_Optimizer: # Maintain different fp32 gradients views for convenience self.fp32_groups_gradients = [] + self.fp32_groups_gradient_dict = {} self.fp32_groups_gradients_flat = [] self.fp32_groups_actual_gradients_flat = [] self.fp32_groups_gradient_flat_partition = [] self.fp32_groups_has_gradients = [] self.step_count = 0 - self.groups_padding = [] + self.group_paddings = [] + + if self.using_real_optimizer: + 
self._setup_for_real_optimizer() + + see_memory_usage('end bf16_optimizer', force=True) + + def _setup_for_real_optimizer(self): + dp_world_size = dist.get_world_size(group=self.dp_process_group) + self.partition_count = [ + dp_world_size for i in range(len(self.optimizer.param_groups)) + ] for i, param_group in enumerate(self.optimizer.param_groups): see_memory_usage(f'before initializing group {i}', force=True) @@ -170,6 +145,7 @@ class BF16_Optimizer: flat_tensor=self.fp32_groups_gradients_flat[i], num_elem_list=num_elem_list) self.fp32_groups_gradients.append(fp32_gradients) + self.fp32_groups_gradient_dict[i] = fp32_gradients # flat tensor corresponding to actual fp32 gradients (i.e., minus alignment padding) length_without_padding = sum(num_elem_list) @@ -196,7 +172,7 @@ class BF16_Optimizer: else: padding = 0 - self.groups_padding.append(padding) + self.group_paddings.append(padding) # update optimizer param groups to reference fp32 params partition param_group['params'] = [self.fp32_groups_flat_partition[i]] @@ -209,96 +185,44 @@ class BF16_Optimizer: # Need optimizer states initialized before linking lp to optimizer state self._link_all_hp_params() - - see_memory_usage('end bf16_optimizer', force=True) + self._enable_universal_checkpoint() + self._param_slice_mappings = self._create_param_mapping() + + def _enable_universal_checkpoint(self): + for lp_param_group in self.bf16_groups: + enable_universal_checkpoint(param_list=lp_param_group) + + def _create_param_mapping(self): + param_mapping = [] + for i, _ in enumerate(self.optimizer.param_groups): + param_mapping_per_group = OrderedDict() + for lp in self.bf16_groups[i]: + if lp._hp_mapping is not None: + lp_name = self.param_names[lp] + param_mapping_per_group[ + lp_name] = lp._hp_mapping.get_hp_fragment_address() + param_mapping.append(param_mapping_per_group) + + return param_mapping def _link_all_hp_params(self): dp_world_size = dist.get_world_size(group=self.dp_process_group) - for i, param_group in 
enumerate(self.optimizer.param_groups): + for i, _ in enumerate(self.optimizer.param_groups): # Link bf16 and fp32 params in partition - # TODO: Make this configurable partition_id = dist.get_rank(group=self.real_dp_process_group[i]) partition_size = self.bf16_groups_flat[i].numel() // dp_world_size - self._link_hp_params(self.bf16_groups[i], - self.fp32_groups_flat_partition[i], - partition_id * partition_size, - partition_size, - self.real_dp_process_group[i]) - - def _init_lp_to_hp_mapping(self, - lp_param_list, - partition_start, - partition_size, - dp_group): - current_offset = 0 - param_and_offset_list = [] - partition_end = partition_start + partition_size - for lp_param in lp_param_list: - lp_param._hp_mapping = None - lp_param._dp_group = dp_group - lp_param.get_full_hp_param = types.MethodType(get_full_hp_param, lp_param) - # lp_param overlaps with partition if both are true - # 1) current_offset < partition_end, - # 2) current_offset + lp_param.numel() >= partition_start - lp_param_end = current_offset + lp_param.numel() - if current_offset < partition_end and lp_param_end > partition_start: - param_and_offset_list.append((lp_param, current_offset)) - current_offset += lp_param.numel() - - return param_and_offset_list - - def _link_hp_params(self, - lp_param_list, - flat_hp_partition, - partition_start, - partition_size, - dp_group): - local_lp_param_and_offset = self._init_lp_to_hp_mapping( - lp_param_list, - partition_start, - partition_size, - dp_group) - - hp_end = partition_start + partition_size - for lp_param, lp_start in local_lp_param_and_offset: - lp_end = lp_param.numel() + lp_start - hp_start = partition_start - - fragment_start = max(lp_start, hp_start) - fragment_end = min(lp_end, hp_end) - # print( - # f'{self.dp_rank=} {lp_start=} {lp_end-lp_start=} {hp_start=} {hp_end-hp_start=} {fragment_start=} {fragment_end-fragment_start=}' - # ) - assert fragment_start < fragment_end, \ - f'fragment start {fragment_start} should be < fragment_end 
{fragment_end}' - - fragment_numel = fragment_end - fragment_start - hp_frag_address = fragment_address(start=fragment_start - hp_start, - numel=fragment_numel) - hp_fragment_tensor = flat_hp_partition.narrow(0, - hp_frag_address.start, - hp_frag_address.numel) - - optim_fragment = { - key: value.narrow(0, - hp_frag_address.start, - hp_frag_address.numel) - for key, - value in self.optimizer.state[flat_hp_partition].items() - if torch.is_tensor(value) - } - - lp_frag_address = fragment_address(start=fragment_start - lp_start, - numel=fragment_numel) - lp_fragment_tensor = lp_param.flatten().narrow(0, - lp_frag_address.start, - lp_frag_address.numel) - - lp_param._hp_mapping = tensor_fragment(lp_fragment=lp_fragment_tensor, - lp_fragment_address=lp_frag_address, - hp_fragment=hp_fragment_tensor, - hp_fragment_address=hp_frag_address, - optim_fragment=optim_fragment) + flat_hp_partition = self.fp32_groups_flat_partition[i] + link_hp_params( + lp_param_list=self.bf16_groups[i], + flat_hp_partition=flat_hp_partition, + gradient_dict=self.fp32_groups_gradient_dict, + offload_gradient_dict=None, + use_offload=False, + param_group_index=i, + partition_start=partition_id * partition_size, + partition_size=partition_size, + partition_optimizer_state=self.optimizer.state[flat_hp_partition], + dp_group=self.real_dp_process_group[i]) def initialize_optimizer_states(self): """Take an optimizer step with zero-valued gradients to allocate internal @@ -356,11 +280,6 @@ class BF16_Optimizer: self.update_lp_params() - all_gather_dp_groups(partitioned_param_groups=self.bf16_partitioned_groups, - dp_process_group=self.real_dp_process_group, - start_alignment_factor=self.nccl_start_alignment_factor, - allgather_bucket_size=self.allgather_bucket_size) - self.clear_hp_grads() self.step_count += 1 @@ -427,6 +346,14 @@ class BF16_Optimizer: for i, (bf16_partitions, fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)): partition_id = 
dist.get_rank(group=self.real_dp_process_group[i]) bf16_partitions[partition_id].data.copy_(fp32_partition.data) + # print_rank_0(f'update_lp_params {i=} {partition_id=}', force=True) + # if i == 0: + # print_rank_0(f'{fp32_partition[:10]=}', force=True) + + all_gather_dp_groups(partitioned_param_groups=self.bf16_partitioned_groups, + dp_process_group=self.real_dp_process_group, + start_alignment_factor=self.nccl_start_alignment_factor, + allgather_bucket_size=self.allgather_bucket_size) def clear_hp_grads(self): for flat_gradients in self.fp32_groups_gradients_flat: @@ -445,9 +372,10 @@ class BF16_Optimizer: state_dict[CLIP_GRAD] = self.clip_grad state_dict[BASE_OPTIMIZER_STATE] = self.optimizer.state_dict() state_dict[SINGLE_PARTITION_OF_FP32_GROUPS] = self.fp32_groups_flat_partition - state_dict[GROUPS_PADDING] = self.groups_padding + state_dict[GROUP_PADDINGS] = self.group_paddings state_dict[PARTITION_COUNT] = self.partition_count state_dict[DS_VERSION] = version + state_dict[PARAM_SLICE_MAPPINGS] = self._param_slice_mappings return state_dict @@ -463,8 +391,23 @@ class BF16_Optimizer: def load_state_dict(self, state_dict_list, + checkpoint_folder, load_optimizer_states=True, load_from_fp32_weights=False): + if checkpoint_folder: + self._load_universal_checkpoint(checkpoint_folder, + load_optimizer_states, + load_from_fp32_weights) + else: + self._load_legacy_checkpoint(state_dict_list, + load_optimizer_states, + load_from_fp32_weights) + + def _load_legacy_checkpoint(self, + state_dict_list, + load_optimizer_states=True, + load_from_fp32_weights=False): + dp_rank = dist.get_rank(group=self.dp_process_group) current_rank_sd = state_dict_list[dp_rank] @@ -482,13 +425,35 @@ class BF16_Optimizer: src_tensor = _get_padded_tensor(saved, current.numel()) current.data.copy_(src_tensor.data) - self._link_all_hp_params() + if load_optimizer_states: + self._link_all_hp_params() + + def _load_universal_checkpoint(self, + checkpoint_folder, + load_optimizer_states, + 
load_from_fp32_weights): + self._load_hp_checkpoint_state(checkpoint_folder) @property def param_groups(self): """Forward the wrapped optimizer's parameters.""" return self.optimizer.param_groups + def _load_hp_checkpoint_state(self, checkpoint_dir): + checkpoint_dir = os.path.join(checkpoint_dir, "zero") + tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) + tp_world_size = self.mpu.get_slice_parallel_world_size() + + for i, _ in enumerate(self.optimizer.param_groups): + for lp in self.bf16_groups[i]: + if lp._hp_mapping is not None: + #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") + lp.load_hp_checkpoint_state( + os.path.join(checkpoint_dir, + self.param_names[lp]), + tp_rank, + tp_world_size) + def _get_padded_tensor(src_tensor, size): if src_tensor.numel() >= size: @@ -497,86 +462,3 @@ def _get_padded_tensor(src_tensor, size): slice_tensor = torch.narrow(padded_tensor, 0, 0, src_tensor.numel()) slice_tensor.data.copy_(src_tensor.data) return padded_tensor - - -''' -Logic for lp_param to hp_param mapping - -lp lp0 lp1 lp2 lp3 lp4 <------- indices/names -lp [ ][ ][ ][ ][ ] <-------- tensors -flat_lp [ ] <-------- flat lp params -flat_hp [ ] <------------------ flat hp partition on current rank -full_hp [ ] <------- full flat hp params - - -lp2 - full numel = 16 - lp_frag - numel = 12 - frag_start = 3 - frag_end = 15 - hp_frag - numel = 12 - frag_start = 0 - frag_end = 11 - - hp_frag.copy_(lp_frag) - - -lp3: - full numel = 4 - lp_frag - numel = 4 - start = 0 - end = 3 - hp_frag - numel = 4 - start = 12 - end = 15 - - -lp4: - full numel = 12 - lp_frag - numel = 4 - start = 0 - end = 3 - hp_frag - numel = 4 - start = 16 - end = 19 - - - -Visual depiction of above -lp { } -flat_lp [ ] -flat_hp ( ) - - -flat_lp [ { ( } ) ] - lx hx ly hy - ly-hx - - -lp { } -flat_lp [ ] -flat_hp ( ) - - -flat_lp [ ( { ) } ] - hx lx hy ly - hy-lx - -lp { } -flat_lp [ ] -flat_hp ( ) - - -flat_lp [ ( { } ) ] - hx lx ly hy - ly-lx - -lp -> (lx, hy) -flat_hp -> 
(hx, hy) -''' diff --git a/deepspeed/runtime/checkpoint_engine/README.md b/deepspeed/runtime/checkpoint_engine/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a19f54889802fa77fbf521456c8b4ac1f2ba8287 --- /dev/null +++ b/deepspeed/runtime/checkpoint_engine/README.md @@ -0,0 +1,37 @@ +# Checkpoint Engine + + +The `CheckpointEngine` was designed to modularized the checkpoint serialization. In this way, we can simply replace/refine the checkpoint serialization methods. + +### Interface for `CheckpointEngine` + +Basically, for checkpoint management(save/load by deepspeed with the given tag), the `CheckpointEngine` will: + + 1. To make preliminaries ready by call `create(tag)`. For `torch`, we can just log some extra info as `torch` can directly call `save/load` without other preparation. + + 2. After the `create(tag)`, deepspeed can call `save/load` to persist files into disk/memory/etc. + + 3. When all the files for a tag are ready, deepspeed engine will call `commit()` to tell the checkpoint engine current checkpoint is complete. For original torch, it also plays the role of logger. + + +```python +class CheckpointEngine(object): + # init checkpoint engine for save/load + def __init__(self, config_params=None): + pass + + def create(self, tag): + # create checkpoint on give tag for save/load. + pass + + def save(self, state_dict, path: str): + pass + + def load(self, path: str, map_location=None): + pass + + def commit(self, tag): + # to tell checkpoint services if all files are readys. 
+ pass + +``` diff --git a/deepspeed/runtime/checkpoint_engine/__init__.py b/deepspeed/runtime/checkpoint_engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 --- /dev/null +++ b/deepspeed/runtime/checkpoint_engine/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/checkpoint_engine/checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/checkpoint_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..cb6decb8826eb22574971100d2ca015f504a90dc --- /dev/null +++ b/deepspeed/runtime/checkpoint_engine/checkpoint_engine.py @@ -0,0 +1,22 @@ +'''Copyright The Microsoft DeepSpeed Team''' + + +class CheckpointEngine(object): + + # init checkpoint engine for save/load + def __init__(self, config_params=None): + pass + + def create(self, tag): + # create checkpoint on give tag for save/load. + pass + + def save(self, state_dict, path: str): + pass + + def load(self, path: str, map_location=None): + pass + + def commit(self, tag): + # to tell checkpoint services if all files are readys. 
+ pass diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..42bb067d087dfa863d9a904b6e62a0dfe32257dc --- /dev/null +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -0,0 +1,115 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import torch +import torch_nebula + +from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \ + CheckpointEngine +from deepspeed.utils import logger, log_dist +from deepspeed.nebula.constants import * + + +def _get_tag_from_path(path): + return os.path.basename(os.path.dirname(path)) + + +class NebulaCheckpointEngine(CheckpointEngine): + def __init__(self, config_params=None): + super().__init__(config_params) + self.checkpoint = None + self.tag_flag = None + self.enable_nebula_load = config_params.enable_nebula_load + self.nebula_load_path = config_params.load_path + if self.nebula_load_path is None: + self.nebula_load_path = config_params.persistent_storage_path + + nebula_config_params = { + NEBULA_PERSISTENT_STORAGE_PATH: config_params.persistent_storage_path, + NEBULA_PERSISTENT_TIME_INTERVAL: config_params.persistent_time_interval, + NEBULA_NUM_OF_VERSION_IN_RETENTION: + config_params.num_of_version_in_retention, + } + torch_nebula.init(**nebula_config_params) + + def create(self, tag): + log_dist(f"[Nebula] Start Checkpoint for tag:{tag}", ranks=[0]) + # -2 means: customer needs to explicitly tell nebula + # current checkpoint is complete by commit methond. 
+ self.checkpoint = torch_nebula.Checkpoint(tag, -2) + + def save(self, state_dict, path: str): + log_dist(f"[Nebula] Create dummy files for loading.") + torch.save("", path) + + tag = _get_tag_from_path(path) + partititon_name = os.path.basename(path) + logger.info(f"[Nebula] Saving {partititon_name} under tag {tag}...") + self.checkpoint.save(partititon_name, state_dict) + logger.info(f"[Nebula] Saved {partititon_name} under tag {tag}.") + return None + + def load(self, path: str, map_location=None): + tag = _get_tag_from_path(path) + first_load_flag = self.tag_flag is None or self.tag_flag == tag + if not self.enable_nebula_load and first_load_flag: + self.tag_flag = tag + logger.info( + f"[Nebula] Disable nebula load. Loading checkpoint from {path} ...") + partition = torch.load(path, map_location=map_location) + logger.info(f"[Nebula] Disable nebula load. Loaded checkpoint from {path} .") + return partition + + partititon_name = os.path.basename(path) + logger.info( + f"[Nebula] Loading {path} under tag {tag} from nebula path {self.nebula_load_path}..." + ) + + checkpoint = None + if tag in (None, 'latest', 'latest_universal'): + # In some cases, there is the inconsistent tag between deepspeed metadata (latest file) + # and nebula metadata, will lead to the failure on loading with deepspeed tag. Then we + # will try to load the valid latest checkpoint from nebula(tier3 > tier1). So, in summary + # when met failure loading for given tag, the loading priority would be like: + # nebula tier3 latest > nebula tier1 latest. + checkpoint = torch_nebula.get_latest_checkpoint( + persist_path=self.nebula_load_path) + else: + checkpoint = torch_nebula.get_checkpoint(tag=tag, + persist_path=self.nebula_load_path) + + if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): + logger.info( + f"Unable to find valid checkpoint tag:{tag} from Nebula, try to get latest checkpoint again from nebula {self.nebula_load_path} path!" 
+ ) + # nebula tier3 latest + checkpoint = torch_nebula.get_latest_checkpoint( + persist_path=self.nebula_load_path) + if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): + logger.info( + f"Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!" + ) + # nebula tier1 latest + checkpoint = torch_nebula.get_latest_checkpoint() + logger.warning( + f"Unable to find valid checkpoint from Nebula under tag:{tag}.") + return None + + tag = checkpoint.tag + self.tag_flag = -1 + partition = checkpoint.load(partititon_name, map_location=map_location) + logger.info( + f"[Nebula] Loaded {path} under tag {tag} from {self.nebula_load_path}.") + return partition + + def commit(self, tag): + # nebula commit will be call when all files under give tag are ready to be persisted in the async way. + logger.info( + f"[Nebula] all files for {tag} are saved in tier1. It is ready to start persisting" + ) + commit_rls = self.checkpoint.commit() + if not commit_rls: + logger.error( + f"[Nebula] failed to commit the checkpoint, please check the log.") + return False + return commit_rls diff --git a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..5ee131800defea9bba99f7bdd7c499d838ee54d6 --- /dev/null +++ b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py @@ -0,0 +1,30 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from deepspeed.utils import logger, log_dist +from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \ + CheckpointEngine + + +class TorchCheckpointEngine(CheckpointEngine): + def __init__(self, config_params=None): + super().__init__(config_params) + + def create(self, tag): + log_dist(f"[Torch] Checkpoint {tag} is begin to save!", ranks=[0]) + + def save(self, state_dict, path: str): + logger.info(f"[Torch] Saving 
{path}...") + torch.save(state_dict, path) + logger.info(f"[Torch] Saved {path}.") + return None + + def load(self, path: str, map_location=None): + logger.info(f"[Torch] Loading checkpoint from {path}...") + partition = torch.load(path, map_location=map_location) + logger.info(f"[Torch] Loaded checkpoint from {path}.") + return partition + + def commit(self, tag): + logger.info(f"[Torch] Checkpoint {tag} is ready now!") + return True diff --git a/deepspeed/runtime/comm/__init__.py b/deepspeed/runtime/comm/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 100644 --- a/deepspeed/runtime/comm/__init__.py +++ b/deepspeed/runtime/comm/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py index 3b42d223182d357f5f188ba54f0d5732abec55eb..07302b1b7ae217e294dd5bb71221d0ba99bfac2f 100644 --- a/deepspeed/runtime/comm/coalesced_collectives.py +++ b/deepspeed/runtime/comm/coalesced_collectives.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' """batched collective operations for overhead amortization and better bandwidth utilization""" @@ -6,36 +7,23 @@ from typing import List import torch from torch import Tensor -import torch.distributed +from deepspeed import comm as dist +# NOTE: Use torch.distributed's ProcessGroup class until we have our own. from torch.distributed import ProcessGroup import torch.nn.functional from deepspeed.utils import instrument_w_nvtx -from deepspeed.utils.logging import logger - -if hasattr(torch.distributed, "_reduce_scatter_base"): - - def torch_reduce_scatter_fn(input_tensor: Tensor, output_tensor: Tensor, group): - instrument_w_nvtx(torch.distributed._reduce_scatter_base)( - output_tensor, - input_tensor, - group=group, - ) -else: - logger.warning( - "unable to find torch.distributed._reduce_scatter_base. 
will fall back to " - "torch.distributed.reduce_scatter which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") - - def torch_reduce_scatter_fn(input_tensor: Tensor, output_tensor: Tensor, group): - input_tensor_lst = list( - torch.chunk(input_tensor, - torch.distributed.get_world_size(group))) - instrument_w_nvtx(torch.distributed.reduce_scatter)( - output_tensor, - input_tensor_lst, - group=group, - ) + + +def _torch_reduce_scatter_fn(input_tensor: Tensor, + output_tensor: Tensor, + group=None, + async_op=False, + prof=False): + return instrument_w_nvtx(dist.reduce_scatter_fn)(output_tensor, + input_tensor, + group=group, + async_op=async_op) @instrument_w_nvtx @@ -49,8 +37,8 @@ def reduce_scatter_coalesced( TODO. see if PyTorch team wants a c++ version of this for ProcessGroupNCCL """ - this_rank = torch.distributed.get_rank(group) - world_sz = torch.distributed.get_world_size(group) + this_rank = dist.get_rank(group) + world_sz = dist.get_world_size(group) partition_lst_for_each_tensor = [None] * len(tensors) for tensor_idx, tensor in enumerate(tensors): @@ -97,9 +85,9 @@ def reduce_scatter_coalesced( world_sz) # batched reduce-scatter call - torch_reduce_scatter_fn(tensor_partition_flat_buffer, - tensor_partition_buffer_for_each_rank[this_rank], - group) + _torch_reduce_scatter_fn(tensor_partition_flat_buffer, + tensor_partition_buffer_for_each_rank[this_rank], + group=group) # reverse procedure of the interleaving done previously, done on the # result of the batched reduce-scatter diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index fcab478a3ca845d30b17e485866b1cfc6ef4566e..fd99068dff82ba114e7c9034d6583b95f55614c2 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -3,12 +3,12 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' import torch -import torch.distributed as dist -import time +from deepspeed import comm as dist import cupy import numpy as 
np from deepspeed.runtime.compression.cupy import CupyBackend +from deepspeed.accelerator import get_accelerator class NcclBackend(object): @@ -69,7 +69,7 @@ class NcclBackend(object): buffer_m = torch.cat([buffer_m, empty_tensor]) buffer_m.add_(worker_error) - worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + worker_scale = torch.norm(buffer_m) / np.sqrt(buffer_m.numel()) worker_error.set_(buffer_m - worker_scale * buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) @@ -101,7 +101,8 @@ class NcclBackend(object): recvbuf_scale = [ torch.zeros(1, dtype=worker_scale.dtype, - device=torch.device(local_rank)) for i in range(self.size) + device=torch.device(get_accelerator().device_name(local_rank))) + for i in range(self.size) ] # communication phase 1 diff --git a/deepspeed/runtime/compression/__init__.py b/deepspeed/runtime/compression/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 100644 --- a/deepspeed/runtime/compression/__init__.py +++ b/deepspeed/runtime/compression/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py old mode 100644 new mode 100755 index 1df5912ef172179aee48f02b84b1c0d74dd17ebf..9da1058f2c8e91ea37873d378f4259cfa33780b8 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ Copyright (c) Microsoft Corporation Licensed under the MIT license. 
@@ -7,7 +8,9 @@ from typing import Union import torch import json +import hjson import copy +import base64 from .constants import * from .fp16.loss_scaler import ( @@ -21,9 +24,12 @@ from .config_utils import ( dict_raise_error_on_duplicate_keys, ScientificNotationEncoder, ) -from .zero.config import DeepSpeedZeroConfig -from .zero.constants import * +from .zero.config import get_zero_config, ZeroStageEnum from .activation_checkpointing.config import DeepSpeedActivationCheckpointingConfig +from ..comm.config import DeepSpeedCommsConfig +from ..monitor.config import get_monitor_config + +from deepspeed import comm as dist from ..git_version_info import version as __version__ from ..utils import logger @@ -38,13 +44,23 @@ from ..elasticity.constants import ( ELASTICITY, IGNORE_NON_ELASTIC_BATCH_INFO, IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT, + MODEL_PARLLEL_SIZE, + MODEL_PARLLEL_SIZE_DEFAULT, + NUM_GPUS_PER_NODE, + NUM_GPUS_PER_NODE_DEFAULT, ) from ..profiling.config import DeepSpeedFlopsProfilerConfig from ..autotuning.config import DeepSpeedAutotuningConfig +from ..nebula.config import DeepSpeedNebulaConfig +from ..compression.config import get_compression_config, get_quantize_enabled +from ..compression.constants import * from .swap_tensor.aio_config import get_aio_config +from .data_pipeline.config import get_data_efficiency_enabled, get_data_efficiency_config, get_curriculum_enabled_legacy, get_curriculum_params_legacy +from .data_pipeline.constants import * + TENSOR_CORE_ALIGN_SIZE = 8 ADAGRAD_OPTIMIZER = 'adagrad' @@ -76,24 +92,6 @@ class DeepSpeedConfigError(Exception): pass -def get_curriculum_enabled(param_dict): - if CURRICULUM_LEARNING in param_dict.keys(): - return get_scalar_param(param_dict[CURRICULUM_LEARNING], - CURRICULUM_ENABLED, - CURRICULUM_ENABLED_DEFAULT) - else: - return False - - -def get_curriculum_params(param_dict): - if CURRICULUM_LEARNING in param_dict.keys(): - curriculum_params = copy.copy(param_dict[CURRICULUM_LEARNING]) - 
curriculum_params.pop(CURRICULUM_ENABLED) - return curriculum_params - else: - return False - - def get_pld_enabled(param_dict): if PROGRESSIVE_LAYER_DROP in param_dict.keys(): return get_scalar_param(param_dict[PROGRESSIVE_LAYER_DROP], @@ -153,6 +151,11 @@ def get_fp16_master_weights_and_grads_enabled(param_dict): return False +def get_fp16_auto_cast(param_dict): + if get_fp16_enabled(param_dict): + return get_scalar_param(param_dict[FP16], FP16_AUTO_CAST, FP16_AUTO_CAST_DEFAULT) + + def get_loss_scale(param_dict): if get_fp16_enabled(param_dict): return get_scalar_param(param_dict[FP16], @@ -220,18 +223,6 @@ def get_sparse_gradients_enabled(param_dict): return get_scalar_param(param_dict, SPARSE_GRADIENTS, SPARSE_GRADIENTS_DEFAULT) -def get_zero_optimization(param_dict): - return get_scalar_param(param_dict, ZERO_OPTIMIZATION, ZERO_OPTIMIZATION_DEFAULT) - - -def get_zero_reduce_scatter(param_dict): - return get_scalar_param( - param_dict, - ZERO_OPTIMIZATION_REDUCE_SCATTER, - ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT, - ) - - def get_communication_data_type(param_dict): val = get_scalar_param(param_dict, COMMUNICATION_DATA_TYPE, @@ -261,73 +252,6 @@ def get_gradient_predivide_factor(param_dict): GRADIENT_PREDIVIDE_FACTOR_DEFAULT) -def get_quantize_enabled(param_dict): - if QUANTIZE_TRAINING in param_dict.keys(): - return get_scalar_param( - param_dict[QUANTIZE_TRAINING], - QUANTIZE_TRAINING_ENABLED, - QUANTIZE_TRAINING_ENABLED_DEFAULT, - ) - else: - return False - - -def get_quantize_training(param_dict): - if QUANTIZE_TRAINING in param_dict.keys(): - return ( - (param_dict[QUANTIZE_TRAINING][QUANTIZE_BITS][TARGET_BITS]), - (param_dict[QUANTIZE_TRAINING][QUANTIZE_BITS][START_BITS] - if START_BITS in param_dict[QUANTIZE_TRAINING][QUANTIZE_BITS].keys() else - QUANTIZE_START_BITS_DEFAULT), - (param_dict[QUANTIZE_TRAINING][QUANTIZE_SCHEDULE][QUANTIZE_PERIOD] - if QUANTIZE_SCHEDULE in param_dict[QUANTIZE_TRAINING].keys() else - QUANTIZE_PERIOD_DEFAULT), - 
(param_dict[QUANTIZE_TRAINING][QUANTIZE_SCHEDULE][SCHEDULE_OFFSET] - if QUANTIZE_SCHEDULE in param_dict[QUANTIZE_TRAINING].keys() and - SCHEDULE_OFFSET in param_dict[QUANTIZE_TRAINING][QUANTIZE_SCHEDULE].keys() - else QUANTIZE_OFFSET_DEFAULT), - (param_dict[QUANTIZE_TRAINING][QUANTIZE_GROUPS] if QUANTIZE_GROUPS - in param_dict[QUANTIZE_TRAINING].keys() else QUANTIZE_GROUPS_DEFAULT), - (param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE] - [FP16_MIXED_QUANTIZE_ENABLED] - if FP16_MIXED_QUANTIZE in param_dict[QUANTIZE_TRAINING].keys() - and FP16_MIXED_QUANTIZE_ENABLED - in param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE].keys() else - FP16_MIXED_QUANTIZE_ENABLED_DEFAULT), - (param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE][QUANTIZE_CHANGE_RATIO] - if FP16_MIXED_QUANTIZE in param_dict[QUANTIZE_TRAINING].keys() - and QUANTIZE_CHANGE_RATIO - in param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE].keys() else - QUANTIZE_CHANGE_RATIO_DEFAULT), - (1 if QUANTIZE_ALGO in param_dict[QUANTIZE_TRAINING] - and QUANTIZE_TYPE in param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO].keys() - and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_TYPE] - == QUANTIZE_ASYMMETRIC else QUANTIZE_TYPE_DEFAULT), - (1 if QUANTIZE_ALGO in param_dict[QUANTIZE_TRAINING] and QUANTIZE_ROUNDING - in param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO].keys() - and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_ROUNDING] - == STOCHASTIC_ROUNDING else QUANTIZE_ROUNDING_DEFAULT), - (param_dict[QUANTIZE_TRAINING][QUANTIZE_VERBOSE] if QUANTIZE_VERBOSE - in param_dict[QUANTIZE_TRAINING].keys() else QUANTIZE_VERBOSE_DEFAULT), - (param_dict[QUANTIZE_TRAINING][QUANTIZER_KERNEL] if QUANTIZER_KERNEL - in param_dict[QUANTIZE_TRAINING].keys() else QUANTIZER_KERNEL_DEFAULT), - ) - else: - return ( - QUANTIZE_TARGET_BITS_DEFAULT, - QUANTIZE_START_BITS_DEFAULT, - QUANTIZE_PERIOD_DEFAULT, - QUANTIZE_OFFSET_DEFAULT, - QUANTIZE_GROUPS_DEFAULT, - FP16_MIXED_QUANTIZE_ENABLED_DEFAULT, - QUANTIZE_CHANGE_RATIO_DEFAULT, - 
QUANTIZE_TYPE_DEFAULT, - QUANTIZE_ROUNDING_DEFAULT, - QUANTIZE_VERBOSE_DEFAULT, - QUANTIZER_KERNEL_DEFAULT, - ) - - def get_steps_per_print(param_dict): return get_scalar_param(param_dict, STEPS_PER_PRINT, STEPS_PER_PRINT_DEFAULT) @@ -615,18 +539,10 @@ def get_memory_breakdown(param_dict): return get_scalar_param(param_dict, MEMORY_BREAKDOWN, MEMORY_BREAKDOWN_DEFAULT) -def get_tensorboard_enabled(param_dict): - if TENSORBOARD in param_dict.keys(): - return get_scalar_param(param_dict[TENSORBOARD], - TENSORBOARD_ENABLED, - TENSORBOARD_ENABLED_DEFAULT) - else: - return False - - def get_eigenvalue_config(param_dict): if get_quantize_enabled(param_dict): param_dict = param_dict[QUANTIZE_TRAINING] + assert not get_eigenvalue_enabled(param_dict), "Eigenvalue based MoQ is temporarily disabled" return ( get_eigenvalue_enabled(param_dict), get_eigenvalue_verbose(param_dict), @@ -724,30 +640,14 @@ def get_eigenvalue_layer_num(param_dict): return EIGENVALUE_LAYER_NUM_DEFAULT -def get_tensorboard_output_path(param_dict): - if get_tensorboard_enabled(param_dict): - return get_scalar_param( - param_dict[TENSORBOARD], - TENSORBOARD_OUTPUT_PATH, - TENSORBOARD_OUTPUT_PATH_DEFAULT, - ) - else: - return TENSORBOARD_OUTPUT_PATH_DEFAULT - - -def get_tensorboard_job_name(param_dict): - if get_tensorboard_enabled(param_dict): - return get_scalar_param(param_dict[TENSORBOARD], - TENSORBOARD_JOB_NAME, - TENSORBOARD_JOB_NAME_DEFAULT) - else: - return TENSORBOARD_JOB_NAME_DEFAULT - - def get_checkpoint_params(param_dict): return param_dict.get(CHECKPOINT, {}) +def get_data_types_params(param_dict): + return param_dict.get(DATA_TYPES, {}) + + def get_checkpoint_tag_validation_mode(checkpoint_params): tag_validation_mode = checkpoint_params.get(CHECKPOINT_TAG_VALIDATION, CHECKPOINT_TAG_VALIDATION_DEFAULT) @@ -761,6 +661,19 @@ def get_checkpoint_tag_validation_mode(checkpoint_params): ) +def get_checkpoint_parallel_write_pipeline(checkpoint_params): + par_write_params = 
checkpoint_params.get(CHECKPOINT_PARALLEL_WRITE, {}) + par_write_pipeline = par_write_params.get( + CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE, + CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE_DEFAULT) + if par_write_pipeline in [True, False]: + return par_write_pipeline + else: + raise DeepSpeedConfigError( + "checkpoint::parallel_write::pipeline_stage " + f"value of '{par_write_pipeline}' is invalid, expecting: true or false") + + def get_dataloader_drop_last(param_dict): return get_scalar_param(param_dict, DATALOADER_DROP_LAST, @@ -794,18 +707,22 @@ class DeepSpeedConfig(object): if isinstance(config, dict): self._param_dict = config elif os.path.exists(config): - self._param_dict = json.load( + self._param_dict = hjson.load( open(config, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys) else: - raise ValueError( - f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {config}" - ) + try: + config_decoded = base64.urlsafe_b64decode(config).decode('utf-8') + self._param_dict = hjson.loads(config_decoded) + except (UnicodeDecodeError, AttributeError): + raise ValueError( + f"Expected a string path to an existing deepspeed config, or a dictionary or a valid base64. 
Received: {config}" + ) try: - self.global_rank = torch.distributed.get_rank() + self.global_rank = dist.get_rank() if mpu is None: - self.world_size = torch.distributed.get_world_size() + self.world_size = dist.get_world_size() else: self.world_size = mpu.get_data_parallel_world_size() except: @@ -827,6 +744,21 @@ class DeepSpeedConfig(object): # Ensure the resource scheduler saw the same elastic config we are using at runtime ensure_immutable_elastic_config(runtime_elastic_config_dict=elastic_dict) + self.elastic_model_parallel_size = elastic_dict.get( + MODEL_PARLLEL_SIZE, + MODEL_PARLLEL_SIZE_DEFAULT) + if self.elastic_model_parallel_size < 1: + raise ElasticityConfigError( + "Model-Parallel size cannot be less than 1, " + f"given model-parallel size: {self.elastic_model_parallel_size}") + + self.num_gpus_per_node = elastic_dict.get(NUM_GPUS_PER_NODE, + NUM_GPUS_PER_NODE_DEFAULT) + if self.num_gpus_per_node < 1: + raise ElasticityConfigError( + "NUmber of GPUs per node cannot be less than 1, " + f"given number of GPUs per node: {self.num_gpus_per_node}") + ignore_non_elastic_batch_info = elastic_dict.get( IGNORE_NON_ELASTIC_BATCH_INFO, IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) @@ -871,7 +803,8 @@ class DeepSpeedConfig(object): self._param_dict[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = micro_batch_size self._param_dict[GRADIENT_ACCUMULATION_STEPS] = gradient_accu_steps - self._initialize_params(self._param_dict) + # Pass a copy so that user json is unmodified, e.g. 
for logging + self._initialize_params(copy.copy(self._param_dict)) self._configure_train_batch_size() self._do_sanity_check() @@ -890,15 +823,19 @@ class DeepSpeedConfig(object): self.gradient_predivide_factor = get_gradient_predivide_factor(param_dict) self.sparse_gradients_enabled = get_sparse_gradients_enabled(param_dict) - self.zero_config = DeepSpeedZeroConfig(param_dict) + self.zero_config = get_zero_config(param_dict) self.zero_optimization_stage = self.zero_config.stage self.zero_enabled = self.zero_optimization_stage > 0 self.activation_checkpointing_config = DeepSpeedActivationCheckpointingConfig( param_dict) + self.comms_config = DeepSpeedCommsConfig(param_dict) + self.monitor_config = get_monitor_config(param_dict) + self.gradient_clipping = get_gradient_clipping(param_dict) self.fp16_enabled = get_fp16_enabled(param_dict) + self.fp16_auto_cast = get_fp16_auto_cast(param_dict) self.bfloat16_enabled = get_bfloat16_enabled(param_dict) assert not (self.fp16_enabled and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled' self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled( @@ -909,20 +846,7 @@ class DeepSpeedConfig(object): self.initial_dynamic_scale = get_initial_dynamic_scale(param_dict) self.dynamic_loss_scale_args = get_dynamic_loss_scale_args(param_dict) - self.quantize_training_enabled = get_quantize_enabled(param_dict) - ( - self.quantize_target_bits, - self.quantize_start_bits, - self.quantize_period, - self.quantize_offset, - self.quantize_groups, - self.fp16_mixed_quantize, - self.quantize_change_rate, - self.quantize_type, - self.quantize_rounding, - self.quantize_verbose, - self.use_quantizer_kernel, - ) = get_quantize_training(param_dict) + self.compression_config = get_compression_config(param_dict) self.optimizer_name = get_optimizer_name(param_dict) if (self.optimizer_name is not None @@ -943,9 +867,6 @@ class DeepSpeedConfig(object): | self.flops_profiler_config.enabled) 
self.memory_breakdown = get_memory_breakdown(param_dict) self.autotuning_config = DeepSpeedAutotuningConfig(param_dict) - self.tensorboard_enabled = get_tensorboard_enabled(param_dict) - self.tensorboard_output_path = get_tensorboard_output_path(param_dict) - self.tensorboard_job_name = get_tensorboard_job_name(param_dict) ( self.eigenvalue_enabled, @@ -964,19 +885,38 @@ class DeepSpeedConfig(object): self.pld_enabled = get_pld_enabled(param_dict) self.pld_params = get_pld_params(param_dict) - self.curriculum_enabled = get_curriculum_enabled(param_dict) - self.curriculum_params = get_curriculum_params(param_dict) + self.curriculum_enabled_legacy = get_curriculum_enabled_legacy(param_dict) + self.curriculum_params_legacy = get_curriculum_params_legacy(param_dict) + + self.data_efficiency_enabled = get_data_efficiency_enabled(param_dict) + self.data_efficiency_config = get_data_efficiency_config(param_dict) checkpoint_params = get_checkpoint_params(param_dict) validation_mode = get_checkpoint_tag_validation_mode(checkpoint_params) self.checkpoint_tag_validation_enabled = (validation_mode != ValidationMode.IGNORE) self.checkpoint_tag_validation_fail = validation_mode == ValidationMode.FAIL + self.load_universal_checkpoint = checkpoint_params.get( + LOAD_UNIVERSAL_CHECKPOINT, + LOAD_UNIVERSAL_CHECKPOINT_DEFAULT) + + self.use_node_local_storage = checkpoint_params.get( + USE_NODE_LOCAL_STORAGE_CHECKPOINT, + USE_NODE_LOCAL_STORAGE_CHECKPOINT_DEFAULT) + + data_types_params = get_data_types_params(param_dict) + self.grad_accum_dtype = data_types_params.get(GRAD_ACCUM_DTYPE, + GRAD_ACCUM_DTYPE_DEFAULT) + + par_write_pipe = get_checkpoint_parallel_write_pipeline(checkpoint_params) + self.checkpoint_parallel_write_pipeline = par_write_pipe self.aio_config = get_aio_config(param_dict) self.dataloader_drop_last = get_dataloader_drop_last(param_dict) + self.nebula_config = DeepSpeedNebulaConfig(param_dict) + def _batch_assertion(self): train_batch = self.train_batch_size @@ 
-1055,13 +995,7 @@ class DeepSpeedConfig(object): self._do_warning_check() - def print(self, name): - logger.info("{}:".format(name)) - for arg in sorted(vars(self)): - if arg != "_param_dict": - dots = "." * (29 - len(arg)) - logger.info(" {} {} {}".format(arg, dots, getattr(self, arg))) - + def print_user_config(self): logger.info(" json = {}".format( json.dumps( self._param_dict, @@ -1072,6 +1006,15 @@ class DeepSpeedConfig(object): ":"), ))) + def print(self, name): + logger.info("{}:".format(name)) + for arg in sorted(vars(self)): + if arg != "_param_dict": + dots = "." * (29 - len(arg)) + logger.info(" {} {} {}".format(arg, dots, getattr(self, arg))) + + self.print_user_config() + def _do_error_check(self): assert ( self.train_micro_batch_size_per_gpu @@ -1083,13 +1026,13 @@ class DeepSpeedConfig(object): if self.zero_enabled: assert ( - self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION + self.zero_optimization_stage <= ZeroStageEnum.max_stage ), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format( - MAX_STAGE_ZERO_OPTIMIZATION + ZeroStageEnum.max_stage ) if self.fp16_master_weights_and_gradients: - assert self.zero_enabled and self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "Fp16_master_weights_and_grads is only supported with ZeRO Stage 2 for now." + assert self.zero_enabled and self.zero_optimization_stage == ZeroStageEnum.gradients, "Fp16_master_weights_and_grads is only supported with ZeRO Stage 2 for now." def _do_warning_check(self): fp16_enabled = self.fp16_enabled diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py old mode 100644 new mode 100755 index 83c48bbee5cb698f348229fb4906ec70e824b89a..cd262f14a6dd4ae072def8a2ecdd4a3cbc190574 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ Copyright (c) Microsoft Corporation Licensed under the MIT license. 
@@ -8,6 +9,133 @@ Collection of DeepSpeed configuration utilities import json import collections import collections.abc +from functools import reduce +from pydantic import BaseModel +from deepspeed.utils import logger + + +class DeepSpeedConfigModel(BaseModel): + """ + This class should be used as a base for all DeepSpeed configs. It extends + pydantic.BaseModel to allow for deprecated fields. To enable this feature, + add deprecated=True to pydantic.Field: + + my_dep_field: int = Field(0, deprecated=True) + + Deprecated Field kwargs: + - deprecated: [True|False], default False + Enables / Disables deprecated fields + - deprecated_msg: str, default "" + Message to include with deprecation warning + - new_param: str, default "" + Name of the field replacing the deprecated field + - set_new_param: [True|False], default True + If new_param is provided, enables setting the value of that param with + deprecated field value + - new_param_fn: callable, default (lambda x: x) + If new_param is provided and set_new_param is True, this function will + modify the value of the deprecated field before placing that value in + the new_param field + + Example: + my_new_field is replacing a deprecated my_old_field. The expected type + for my_new_field is int while the expected type for my_old_field is + str. 
We want to maintain backward compatibility with our configs, so we + define the fields with: + + class MyExampleConfig(DeepSpeedConfigModel): + my_new_field: int = 0 + my_old_field: str = Field('0', + deprecated=True, + new_param='my_new_field', + new_param_fn=(lambda x: int(x))) + """ + def __init__(self, strict=False, **data): + if ( + not strict + ): # This is temporary until we refactor all DS configs, allows HF to load models + data = { + k: v + for k, + v in data.items() if (v != "auto" or k == "replace_method") + } + super().__init__(**data) + self._deprecated_fields_check(self) + + def _process_deprecated_field(self, pydantic_config, field): + # Get information about the deprecated field + fields_set = pydantic_config.__fields_set__ + dep_param = field.name + kwargs = field.field_info.extra + new_param_fn = kwargs.get("new_param_fn", lambda x: x) + param_value = new_param_fn(getattr(pydantic_config, dep_param)) + new_param = kwargs.get("new_param", "") + dep_msg = kwargs.get("deprecated_msg", "") + if dep_param in fields_set: + logger.warning(f"Config parameter {dep_param} is deprecated" + + (f" use {new_param} instead" if new_param else "") + + (f". 
{dep_msg}" if dep_msg else "")) + # Check if there is a new param and if it should be set with a value + if new_param and kwargs.get("set_new_param", True): + # Remove the deprecate field if there is a replacing field + try: + delattr(pydantic_config, dep_param) + except Exception as e: + logger.error(f"Tried removing deprecated '{dep_param}' from config") + raise e + + # Set new param value + new_param_nested = new_param.split(".") + if len(new_param_nested) > 1: + # If the new param exists in a subconfig, we need to get + # the fields set for that subconfig + pydantic_config = reduce(getattr, + new_param_nested[:-1], + pydantic_config) + fields_set = pydantic_config.__fields_set__ + new_param_name = new_param_nested[-1] + assert ( + new_param_name not in fields_set + ), f"Cannot provide deprecated parameter '{dep_param}' and replacing parameter '{new_param}' together" + # A custom function for converting the old param value to new param value can be provided + try: + setattr(pydantic_config, new_param_name, param_value) + except Exception as e: + logger.error( + f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'" + ) + raise e + + def _deprecated_fields_check(self, pydantic_config): + fields = pydantic_config.__fields__ + for field in fields.values(): + if field.field_info.extra.get("deprecated", False): + self._process_deprecated_field(pydantic_config, field) + + class Config: + validate_all = True + validate_assignment = True + use_enum_values = True + allow_population_by_field_name = True + extra = "forbid" + arbitrary_types_allowed = True + + +class pp_int(int): + """ + A wrapper for integers that will return a custom string or comma-formatted + string of the integer. For example, print(pp_int(1e5)) will return + "10,000". This is useful mainly for auto-generated documentation purposes. 
+ """ + def __new__(cls, val, custom_print_str=None): + inst = super().__new__(cls, val) + inst.custom_print_str = custom_print_str + return inst + + def __repr__(self): + if self.custom_print_str: + return self.custom_print_str + return f"{self.real:,}" # adapted from https://stackoverflow.com/a/50701137/9201239 @@ -37,7 +165,7 @@ class ScientificNotationEncoder(json.JSONEncoder): f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, v in o.items() ] - return "{" + ', '.join(x) + f"\n{prefix_close}" + "}" + return "{" + ", ".join(x) + f"\n{prefix_close}" + "}" elif isinstance(o, collections.abc.Sequence) and not isinstance(o, str): return f"[{ f', '.join(map(self.iterencode, o)) }]" return "\n, ".join(super().iterencode(o, _one_shot)) diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py old mode 100644 new mode 100755 index ee2e51c6109f8f038ea59491bd29e0e9ac664d47..6925745a8e5a022843ade33f3f19dd706e2b5771 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -1,3 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ Copyright (c) Microsoft Corporation Licensed under the MIT license. 
@@ -133,8 +134,9 @@ FP16_FORMAT = ''' FP16 parameters should be of the format: "fp16": { "enabled": true, + "auto_cast": false, "loss_scale": 0, - "initial_scale_power": 32, + "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 @@ -149,9 +151,12 @@ FP16_ENABLED_DEFAULT = False FP16_LOSS_SCALE = "loss_scale" FP16_LOSS_SCALE_DEFAULT = 0 +FP16_AUTO_CAST = "auto_cast" +FP16_AUTO_CAST_DEFAULT = False + # FP16 initial dynamic scale loss power FP16_INITIAL_SCALE_POWER = "initial_scale_power" -FP16_INITIAL_SCALE_POWER_DEFAULT = 32 +FP16_INITIAL_SCALE_POWER_DEFAULT = 16 # FP16 loss scale window FP16_LOSS_SCALE_WINDOW = "loss_scale_window" @@ -282,33 +287,6 @@ WALL_CLOCK_BREAKDOWN_DEFAULT = False MEMORY_BREAKDOWN = 'memory_breakdown' MEMORY_BREAKDOWN_DEFAULT = False -######################################### -# Tensorboard -######################################### -# Tensorboard. By default, this feature is not enabled. -# Users can configure in ds_config.json as below example: -TENSORBOARD_FORMAT = ''' -Tensorboard can be specified as: -"tensorboard": { - "enabled": true, - "output_path": "/home/myname/foo", - "job_name": "model_lr2e-5_epoch3_seed2_seq64" -} -''' -TENSORBOARD = "tensorboard" - -# Tensorboard enable signal -TENSORBOARD_ENABLED = "enabled" -TENSORBOARD_ENABLED_DEFAULT = False - -# Tensorboard output path -TENSORBOARD_OUTPUT_PATH = "output_path" -TENSORBOARD_OUTPUT_PATH_DEFAULT = "" - -# Tensorboard job name -TENSORBOARD_JOB_NAME = "job_name" -TENSORBOARD_JOB_NAME_DEFAULT = "DeepSpeedJobName" - ######################################### # Eigenvalue ######################################### @@ -366,14 +344,6 @@ PLD_THETA_DEFAULT = 1.0 PLD_GAMMA = "gamma" PLD_GAMMA_DEFAULT = 0.001 -######################################### -# Curriculum Learning -######################################### -CURRICULUM_LEARNING = "curriculum_learning" - -CURRICULUM_ENABLED = "enabled" -CURRICULUM_ENABLED_DEFAULT = False - 
######################################### # Validation modes @@ -387,7 +357,14 @@ class ValidationMode: ######################################### # Checkpoint config params ######################################### -# "checkpoint": {tag_validation=["Ignore"|"Warn"|"Fail"]} +# "checkpoint": { +# tag_validation=["Ignore"|"Warn"|"Fail"] +# load_universal=false +# use_node_local_storage=false +# parallel_write: { +# pipeline_stage: [True|False] +# } +# } CHECKPOINT = "checkpoint" CHECKPOINT_TAG_VALIDATION = "tag_validation" CHECKPOINT_TAG_VALIDATION_DEFAULT = ValidationMode.WARN @@ -397,43 +374,27 @@ CHECKPOINT_TAG_VALIDATION_MODES = [ ValidationMode.FAIL ] +LOAD_UNIVERSAL_CHECKPOINT = "load_universal" +LOAD_UNIVERSAL_CHECKPOINT_DEFAULT = False + +USE_NODE_LOCAL_STORAGE_CHECKPOINT = "use_node_local_storage" +USE_NODE_LOCAL_STORAGE_CHECKPOINT_DEFAULT = False + +CHECKPOINT_PARALLEL_WRITE = "parallel_write" +CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE = "pipeline_stage" +CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE_DEFAULT = False + +######################################### +# Data types config params ######################################### -# Quantization -######################################### -QUANTIZE_TRAINING = "quantize_training" -QUANTIZE_BITS = "quantize_bits" -START_BITS = "start_bits" -TARGET_BITS = "target_bits" -QUANTIZER_KERNEL = "quantizer_kernel" -QUANTIZE_SCHEDULE = "quantize_schedule" -QUANTIZE_PERIOD = "quantize_period" -SCHEDULE_OFFSET = "schedule_offset" -QUANTIZE_GROUPS = "quantize_groups" -FP16_MIXED_QUANTIZE = "fp16_mixed_quantize" -QUANTIZE_CHANGE_RATIO = "quantize_change_ratio" -FP16_MIXED_QUANTIZE_ENABLED = "enabled" -QUANTIZE_VERBOSE = "quantize_verbose" -QUANTIZE_ALGO = "quantize_algo" -QUANTIZE_TYPE = "q_type" -QUANTIZE_SYMMETRIC = "symmetric" -QUANTIZE_ASYMMETRIC = "asymmetric" -STOCHASTIC_ROUNDING = "stochastic" -NEAREST_ROUNDING = "nearest" -QUANTIZE_ROUNDING = "rounding" -QUANTIZE_TRAINING_ENABLED = "enabled" 
-QUANTIZE_TRAINING_ENABLED_DEFAULT = False -QUANTIZE_TRAINING_DEFAULT = False -QUANTIZE_START_BITS_DEFAULT = 16 -QUANTIZE_TARGET_BITS_DEFAULT = 8 -QUANTIZER_KERNEL_DEFAULT = False -QUANTIZE_PERIOD_DEFAULT = 1000 -QUANTIZE_OFFSET_DEFAULT = 1000 -QUANTIZE_GROUPS_DEFAULT = 1 -QUANTIZE_TYPE_DEFAULT = 0 #symmetric -QUANTIZE_ROUNDING_DEFAULT = 0 #nearest -FP16_MIXED_QUANTIZE_ENABLED_DEFAULT = False -QUANTIZE_CHANGE_RATIO_DEFAULT = 0.001 -QUANTIZE_VERBOSE_DEFAULT = False +# "data_types": { +# grad_accum_dtype=["bf16"|"fp16"|"fp32"] +# } +# } + +DATA_TYPES = "data_types" +GRAD_ACCUM_DTYPE = "grad_accum_dtype" +GRAD_ACCUM_DTYPE_DEFAULT = None ######################################### # Drop the last incomplete Batch @@ -451,3 +412,9 @@ DATALOADER_DROP_LAST_DEFAULT = False # PIPELINE PARALLELISM ######################################### PIPE_REPLICATED = 'ds_pipe_replicated' + +######################################### +# DATA PARALLELISM +######################################### +DATA_PARALLEL_GROUP = "data_parallel_group" +GLOBAL_RANK = "global_rank" diff --git a/deepspeed/runtime/data_pipeline/__init__.py b/deepspeed/runtime/data_pipeline/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 100644 --- a/deepspeed/runtime/data_pipeline/__init__.py +++ b/deepspeed/runtime/data_pipeline/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py new file mode 100644 index 0000000000000000000000000000000000000000..eefa1402e9aa6e3e585a4661a7b886d9bc3fa037 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/config.py @@ -0,0 +1,180 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +from .constants import * +import copy +from ..config_utils import get_scalar_param + + +# TODO: Reducing config verbosity by returning None or {} when disabled. 
+# One challenge is that we still need to somehow include the default values, +# for example the *_ENABLED has default of false. +def get_data_efficiency_config(param_dict): + output = {} + output[DATA_EFFICIENCY_ENABLED] = get_data_efficiency_enabled(param_dict) + output[DATA_EFFICIENCY_SEED] = get_data_efficiency_seed(param_dict) + if DATA_EFFICIENCY not in param_dict.keys(): + param_dict[DATA_EFFICIENCY] = {} + sub_param_dict = param_dict[DATA_EFFICIENCY] + output[DATA_SAMPLING] = get_data_sampling(sub_param_dict) + output[DATA_ROUTING] = get_data_routing(sub_param_dict) + + return output + + +def get_data_efficiency_enabled(param_dict): + if DATA_EFFICIENCY in param_dict.keys(): + return get_scalar_param(param_dict[DATA_EFFICIENCY], + DATA_EFFICIENCY_ENABLED, + DATA_EFFICIENCY_ENABLED_DEFAULT) + else: + return False + + +def get_data_efficiency_seed(param_dict): + if DATA_EFFICIENCY in param_dict.keys(): + return get_scalar_param(param_dict[DATA_EFFICIENCY], + DATA_EFFICIENCY_SEED, + DATA_EFFICIENCY_SEED_DEFAULT) + else: + return DATA_EFFICIENCY_SEED_DEFAULT + + +def get_data_sampling(param_dict): + output = {} + output[DATA_SAMPLING_ENABLED] = get_data_sampling_enabled(param_dict) + output[DATA_SAMPLING_NUM_EPOCHS] = get_data_sampling_num_epochs(param_dict) + output[DATA_SAMPLING_NUM_WORKERS] = get_data_sampling_num_workers(param_dict) + if DATA_SAMPLING not in param_dict.keys(): + param_dict[DATA_SAMPLING] = {} + sub_param_dict = param_dict[DATA_SAMPLING] + output[CURRICULUM_LEARNING] = get_curriculum_learning(sub_param_dict) + + return output + + +def get_data_sampling_enabled(param_dict): + if DATA_SAMPLING in param_dict.keys(): + return get_scalar_param(param_dict[DATA_SAMPLING], + DATA_SAMPLING_ENABLED, + DATA_SAMPLING_ENABLED_DEFAULT) + else: + return False + + +def get_data_sampling_num_epochs(param_dict): + if DATA_SAMPLING in param_dict.keys(): + return get_scalar_param(param_dict[DATA_SAMPLING], + DATA_SAMPLING_NUM_EPOCHS, + 
DATA_SAMPLING_NUM_EPOCHS_DEFAULT) + else: + return DATA_SAMPLING_NUM_EPOCHS_DEFAULT + + +def get_data_sampling_num_workers(param_dict): + if DATA_SAMPLING in param_dict.keys(): + return get_scalar_param(param_dict[DATA_SAMPLING], + DATA_SAMPLING_NUM_WORKERS, + DATA_SAMPLING_NUM_WORKERS_DEFAULT) + else: + return DATA_SAMPLING_NUM_WORKERS_DEFAULT + + +def get_curriculum_learning(param_dict): + output = {} + output[CURRICULUM_LEARNING_ENABLED] = get_curriculum_learning_enabled(param_dict) + if CURRICULUM_LEARNING not in param_dict.keys(): + param_dict[CURRICULUM_LEARNING] = {} + sub_param_dict = param_dict[CURRICULUM_LEARNING] + if output[CURRICULUM_LEARNING_ENABLED]: + assert CURRICULUM_LEARNING_METRICS in sub_param_dict.keys(), f"Curriculum learning is enabled, {CURRICULUM_LEARNING_METRICS} must be specified" + for key, val in get_curriculum_learning_params(param_dict).items(): + output[key] = val + return output + + +def get_curriculum_learning_enabled(param_dict): + if CURRICULUM_LEARNING in param_dict.keys(): + return get_scalar_param(param_dict[CURRICULUM_LEARNING], + CURRICULUM_LEARNING_ENABLED, + CURRICULUM_LEARNING_ENABLED_DEFAULT) + else: + return False + + +def get_curriculum_learning_params(param_dict): + if CURRICULUM_LEARNING in param_dict.keys(): + curriculum_learning_params = copy.copy(param_dict[CURRICULUM_LEARNING]) + curriculum_learning_params.pop(CURRICULUM_LEARNING_ENABLED) + return curriculum_learning_params + else: + return {} + + +def get_curriculum_enabled_legacy(param_dict): + if CURRICULUM_LEARNING_LEGACY in param_dict.keys(): + return get_scalar_param(param_dict[CURRICULUM_LEARNING_LEGACY], + CURRICULUM_ENABLED_LEGACY, + CURRICULUM_ENABLED_DEFAULT_LEGACY) + else: + return False + + +def get_curriculum_params_legacy(param_dict): + if CURRICULUM_LEARNING_LEGACY in param_dict.keys(): + curriculum_params = copy.copy(param_dict[CURRICULUM_LEARNING_LEGACY]) + curriculum_params.pop(CURRICULUM_ENABLED_LEGACY) + return curriculum_params + else: + 
return False + + +def get_data_routing(param_dict): + output = {} + output[DATA_ROUTING_ENABLED] = get_data_routing_enabled(param_dict) + if DATA_ROUTING not in param_dict.keys(): + param_dict[DATA_ROUTING] = {} + sub_param_dict = param_dict[DATA_ROUTING] + output[RANDOM_LTD] = get_random_ltd(sub_param_dict) + + return output + + +def get_data_routing_enabled(param_dict): + if DATA_ROUTING in param_dict.keys(): + return get_scalar_param(param_dict[DATA_ROUTING], + DATA_ROUTING_ENABLED, + DATA_ROUTING_ENABLED_DEFAULT) + else: + return False + + +def get_random_ltd(param_dict): + output = {} + output[RANDOM_LTD_ENABLED] = RANDOM_LTD_ENABLED_DEFAULT + output[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE] = {} + output[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][ + RANDOM_LTD_LAYER_TOKEN_LR_ENABLED] = RANDOM_LTD_LAYER_TOKEN_LR_ENABLED_DEFAULT + if get_random_ltd_enabled(param_dict): + output[RANDOM_LTD_ENABLED] = get_random_ltd_enabled(param_dict) + for key, val in get_random_ltd_params(param_dict).items(): + output[key] = val + return output + + +def get_random_ltd_enabled(param_dict): + if RANDOM_LTD in param_dict.keys(): + return get_scalar_param(param_dict[RANDOM_LTD], + RANDOM_LTD_ENABLED, + RANDOM_LTD_ENABLED_DEFAULT) + else: + return False + + +def get_random_ltd_params(param_dict): + if RANDOM_LTD in param_dict.keys(): + random_ltd_params = copy.copy(param_dict[RANDOM_LTD]) + random_ltd_params.pop(RANDOM_LTD_ENABLED) + return random_ltd_params + else: + return {} diff --git a/deepspeed/runtime/data_pipeline/constants.py b/deepspeed/runtime/data_pipeline/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..b801d2e956fc935d92698ac127051327164f202d --- /dev/null +++ b/deepspeed/runtime/data_pipeline/constants.py @@ -0,0 +1,115 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +######################################### +# Data efficiency library +# See sample config at https://www.deepspeed.ai/docs/config-json/#data-efficiency 
+######################################### +DATA_EFFICIENCY = "data_efficiency" +DATA_EFFICIENCY_ENABLED = "enabled" +DATA_EFFICIENCY_ENABLED_DEFAULT = False +DATA_EFFICIENCY_SEED = "seed" +DATA_EFFICIENCY_SEED_DEFAULT = 1234 + +######################################### +# Data efficiency - Data Sampling +######################################### +DATA_SAMPLING = "data_sampling" +DATA_SAMPLING_ENABLED = "enabled" +DATA_SAMPLING_ENABLED_DEFAULT = False +DATA_SAMPLING_NUM_EPOCHS = "num_epochs" +DATA_SAMPLING_NUM_EPOCHS_DEFAULT = 1000 +DATA_SAMPLING_NUM_WORKERS = "num_workers" +DATA_SAMPLING_NUM_WORKERS_DEFAULT = 0 + +######################################### +# Data efficiency - Data Sampling - Curriculum Learning +######################################### +CURRICULUM_LEARNING = "curriculum_learning" +CURRICULUM_LEARNING_ENABLED = "enabled" +CURRICULUM_LEARNING_ENABLED_DEFAULT = False +CURRICULUM_LEARNING_CLUSTER_PATH = "data_cluster_path" +CURRICULUM_LEARNING_METRICS = "curriculum_metrics" +CURRICULUM_LEARNING_SAMPLE_PATH = "index_to_sample_path" +CURRICULUM_LEARNING_METRIC_PATH = "index_to_metric_path" +CURRICULUM_LEARNING_CLUSTERING_TYPE = "clustering_type" +CURRICULUM_LEARNING_SINGLE_CLUSTER = "single_cluster" +CURRICULUM_LEARNING_CLUSTER_PREFIX = "cluster" +CURRICULUM_LEARNING_DIFFICULTY_TYPE = "difficulty_type" +CURRICULUM_LEARNING_VALUE_BASED = "value" +CURRICULUM_LEARNING_PERCENTILE_BASED = "percentile" +CURRICULUM_LEARNING_MIN_DIFFICULTY = "min_difficulty" +CURRICULUM_LEARNING_MAX_DIFFICULTY = "max_difficulty" +CURRICULUM_LEARNING_SCHEDULE_TYPE = "schedule_type" +CURRICULUM_LEARNING_SCHEDULE_CONFIG = "schedule_config" +CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY = "difficulty" +CURRICULUM_LEARNING_SCHEDULE_MAX_STEP = "max_step" +CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP = "total_curriculum_step" +CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP = "difficulty_step" +CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE = "root_degree" +CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE = 
"fixed_discrete" +CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT = "fixed_root" +CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR = "fixed_linear" +CURRICULUM_LEARNING_SCHEDULE_CUSTOM = "custom" +CURRICULUM_LEARNING_CURRENT_DIFFICULTY = "current_difficulty" + +CURRICULUM_LEARNING_BATCH = "batch" +CURRICULUM_LEARNING_CONSUMED_SAMPLES = "consumed_samples" +CURRICULUM_LEARNING_STEP = "curriculum_step" +CURRICULUM_LEARNING_CURRENT_DIFFICULTIES = "current_difficulties" +CURRICULUM_LEARNING_DATA_CLUSTER_PATHS = "data_cluster_paths" +CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION = "data_cluster_current_position" +CURRICULUM_LEARNING_NP_RNG_STATE = "np_rng_state" + +######################################### +# Curriculum Learning legacy implementation +######################################### +CURRICULUM_LEARNING_LEGACY = "curriculum_learning" + +CURRICULUM_ENABLED_LEGACY = "enabled" +CURRICULUM_ENABLED_DEFAULT_LEGACY = False + +######################################### +# Data efficiency - Data Routing +######################################### +DATA_ROUTING = "data_routing" +DATA_ROUTING_ENABLED = "enabled" +DATA_ROUTING_ENABLED_DEFAULT = False + +######################################### +# Data efficiency - Data Routing - Random LTD +######################################### +RANDOM_LTD = "random_ltd" +RANDOM_LTD_ENABLED = "enabled" +RANDOM_LTD_ENABLED_DEFAULT = False + +RANDOM_LTD_MODEL_MASK_NAME = "model_mask_name" +RANDOM_LTD_MODEL_TYPE = "model_type" +RANDOM_LTD_MICRO_BATCH_SIZE = "micro_batch_size" +RANDOM_LTD_GLOBAL_BATCH_SIZE = "global_batch_size" +RANDOM_LTD_SAMPLE_INDEX = "sample_idx" +RANDOM_LTD_ATTENTION_MASK = "attention_mask" +RANDOM_LTD_HIDDEN_STATE_ORDER = "hidden_state_order" +RANDOM_LTD_LAYER_NUM = "random_ltd_layer_num" +RANDOM_LTD_LAYER_ID = "random_ltd_layer_id" +RANDOM_LTD_TOTAL_LAYER_NUM = "total_layer_num" +RANDOM_LTD_CONSUMED_LAYER_TOKENS = "consumed_layer_tokens" + +# scheduler +RANDOM_LTD_SCHEDULER = "random_ltd_schedule" +RANDOM_LTD_MAX_VALUE = 
"max_value" +RANDOM_LTD_MIN_VALUE = "min_value" +RANDOM_LTD_CURRENT_VALUE = "current_value" +RANDOM_LTD_SCHEDULE_CONFIG = "schedule_config" +RANDOM_LTD_INCREASE_STEP = "seq_per_step" +RANDOM_LTD_REQUIRE_STEP = "require_steps" +RANDOM_LTD_SCHEDULER_TYPE = "schedule_type" +RANDOM_LTD_CURR_STEP = "current_steps" + +# learning rate schedulers +RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE = "layer_token_lr_schedule" +RANDOM_LTD_LAYER_TOKEN_LR_ENABLED = "enabled" +RANDOM_LTD_LAYER_TOKEN_LR_ENABLED_DEFAULT = False +RANDOM_LTD_TOTAL_LAYER_TOKENS = "total_layer_tokens" +RANDOM_LTD_WARMUP_TYPE = "warmup_type" +RANDOM_LTD_WARMUP_LAYER_TOKENS = "warmup_layer_tokens" diff --git a/deepspeed/runtime/data_pipeline/curriculum_scheduler.py b/deepspeed/runtime/data_pipeline/curriculum_scheduler.py index 24ff1f8d975a11cb10b6e75a3027dbdad92d1d72..b4cb18c4476b98556cc72cce5ea9f7bd0b571f4a 100644 --- a/deepspeed/runtime/data_pipeline/curriculum_scheduler.py +++ b/deepspeed/runtime/data_pipeline/curriculum_scheduler.py @@ -3,22 +3,30 @@ Copyright 2021 The Microsoft DeepSpeed Team ''' import math from deepspeed.utils import logger +from .constants import * class CurriculumScheduler(object): def __init__(self, config): super().__init__() self.state = {} - assert "curriculum_type" in config, "Curriculum learning requires the config 'curriculum_type'" - assert "min_difficulty" in config, "Curriculum learning requires the config 'min_difficulty'" - assert "max_difficulty" in config, "Curriculum learning requires the config 'max_difficulty'" - assert "schedule_type" in config, "Curriculum learning requires the config 'schedule_type'" - self.state['min_difficulty'] = config['min_difficulty'] - self.state['max_difficulty'] = config['max_difficulty'] - self.state['current_difficulty'] = config['min_difficulty'] - self.state['schedule_type'] = config['schedule_type'] + assert CURRICULUM_LEARNING_MIN_DIFFICULTY in config, \ + f"Curriculum learning requires the config '{CURRICULUM_LEARNING_MIN_DIFFICULTY}'" + 
assert CURRICULUM_LEARNING_MAX_DIFFICULTY in config, \ + f"Curriculum learning requires the config '{CURRICULUM_LEARNING_MAX_DIFFICULTY}'" + assert CURRICULUM_LEARNING_SCHEDULE_TYPE in config, \ + f"Curriculum learning requires the config '{CURRICULUM_LEARNING_SCHEDULE_TYPE}'" + self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY] = config[ + CURRICULUM_LEARNING_MIN_DIFFICULTY] + self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] = config[ + CURRICULUM_LEARNING_MAX_DIFFICULTY] + self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = config[ + CURRICULUM_LEARNING_MIN_DIFFICULTY] + self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] = config[ + CURRICULUM_LEARNING_SCHEDULE_TYPE] self.first_step = True - if config['schedule_type'] == 'fixed_discrete': + if config[ + CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: """ The schedule_config is a list of difficulty and a list of max step belonging to each difficulty. Example json config: @@ -28,17 +36,25 @@ class CurriculumScheduler(object): } The "max_step" has one less element than "difficulty", because the last difficulty will be used for all following steps. - The self.state['schedule'] is a dictionary of + The self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] is a dictionary of difficulty : [max step for this difficulty, next difficulty]. 
""" - assert "difficulty" in config['schedule_config'], "Curriculum learning with fixed_discrete schedule requires the schedule_config 'difficulty'" - assert "max_step" in config['schedule_config'], "Curriculum learning with fixed_discrete schedule requires the schedule_config 'max_step'" - assert len(config['schedule_config']['max_step']) > 0 - assert len(config['schedule_config']['difficulty']) > 0 - assert len(config['schedule_config']['difficulty']) == len( - config['schedule_config']['max_step']) + 1 - self.state['schedule'] = config['schedule_config'] - elif config['schedule_type'] == 'fixed_root': + assert CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ + f"Curriculum learning with fixed_discrete schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY}'" + assert CURRICULUM_LEARNING_SCHEDULE_MAX_STEP in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ + f"Curriculum learning with fixed_discrete schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_MAX_STEP}'" + assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + [CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) > 0 + assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + [CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) > 0 + assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + [CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) == len( + config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + [CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) + 1 + self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[ + CURRICULUM_LEARNING_SCHEDULE_CONFIG] + elif config[ + CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: """ The schedule_config includes: total_curriculum_step: how many steps the curriculum learning takes to go @@ -57,38 +73,59 @@ class CurriculumScheduler(object): "root_degree": 2 } """ - assert "total_curriculum_step" in config['schedule_config'], "Curriculum learning with fixed_root schedule requires the schedule_config 
'total_curriculum_step'" - assert "difficulty_step" in config['schedule_config'], "Curriculum learning with fixed_root schedule requires the schedule_config 'difficulty_step'" - assert "root_degree" in config['schedule_config'], "Curriculum learning with fixed_root schedule requires the schedule_config 'root_degree'" - if config['schedule_config']['difficulty_step'] % 8 != 0: + assert CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ + f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP}'" + assert CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ + f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP}'" + assert CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ + f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE}'" + if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][ + CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: logger.warning( - f'The difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your hardware.' + f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.' 
) - self.state['schedule'] = config['schedule_config'] - elif config['schedule_type'] == 'fixed_linear': + self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[ + CURRICULUM_LEARNING_SCHEDULE_CONFIG] + elif config[ + CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: """ - The schedule_config is the same as 'fixed_root' but without the + The schedule_config is the same as CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT but without the root_degree. "schedule_config": { "total_curriculum_step": 30000, "difficulty_step": 8 } """ - assert "total_curriculum_step" in config['schedule_config'], "Curriculum learning with fixed_linear schedule requires the schedule_config 'total_curriculum_step'" - assert "difficulty_step" in config['schedule_config'], "Curriculum learning with fixed_linear schedule requires the schedule_config 'difficulty_step'" - if config['schedule_config']['difficulty_step'] % 8 != 0: + assert CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ + f"Curriculum learning with fixed_linear schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP}'" + assert CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ + f"Curriculum learning with fixed_linear schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP}'" + if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][ + CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: logger.warning( - f'The difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your hardware.' + f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.' 
) - self.state['schedule'] = config['schedule_config'] + self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[ + CURRICULUM_LEARNING_SCHEDULE_CONFIG] + elif config[ + CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: + """ + Fully customized schedule. User need to provide a custom schedule + function by using the set_custom_curriculum_learning_schedule API + in deepspeed/runtime/engine.py + """ + self.custom_get_difficulty = None else: raise RuntimeError('Unsupported curriculum schedule type') def get_current_difficulty(self): - return self.state['current_difficulty'] + return self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] def set_current_difficulty(self, difficulty): - self.state['current_difficulty'] = difficulty + self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = difficulty + + def set_custom_get_difficulty(self, schedule_function): + self.custom_get_difficulty = schedule_function def get_state(self): return self.state @@ -97,38 +134,49 @@ class CurriculumScheduler(object): self.state = state def __fixed_discrete_get_difficulty(self, global_steps): - s_state = self.state['schedule'] - if global_steps > s_state['max_step'][-1]: - return s_state['difficulty'][-1] - for i in range(len(s_state['max_step'])): - if global_steps <= s_state['max_step'][i]: - return s_state['difficulty'][i] + s_state = self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + if global_steps > s_state[CURRICULUM_LEARNING_SCHEDULE_MAX_STEP][-1]: + return s_state[CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY][-1] + for i in range(len(s_state[CURRICULUM_LEARNING_SCHEDULE_MAX_STEP])): + if global_steps <= s_state[CURRICULUM_LEARNING_SCHEDULE_MAX_STEP][i]: + return s_state[CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY][i] def __fixed_root_get_difficulty(self, global_steps, root_degree=None): - s_state = self.state['schedule'] + s_state = self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] if root_degree is None: - root_degree = s_state['root_degree'] + root_degree = 
s_state[CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE] next_difficulty = (float(global_steps) / - s_state['total_curriculum_step'])**(1.0 / root_degree) - next_difficulty = math.floor( - next_difficulty * - (self.state['max_difficulty'] - self.state['min_difficulty']) + - self.state['min_difficulty']) - next_difficulty -= (next_difficulty % s_state['difficulty_step']) - next_difficulty = min(next_difficulty, self.state['max_difficulty']) + s_state[CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP])**( + 1.0 / root_degree) + next_difficulty = math.floor(next_difficulty * + (self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] - + self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) + + self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) + next_difficulty -= (next_difficulty % + s_state[CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP]) + next_difficulty = min(next_difficulty, + self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY]) return next_difficulty def get_difficulty(self, global_steps): - if self.state['schedule_type'] == 'fixed_discrete': + if self.state[ + CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: return self.__fixed_discrete_get_difficulty(global_steps) - elif self.state['schedule_type'] == 'fixed_linear': + elif self.state[ + CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: return self.__fixed_root_get_difficulty(global_steps, 1) - elif self.state['schedule_type'] == 'fixed_root': + elif self.state[ + CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: return self.__fixed_root_get_difficulty(global_steps) + elif self.state[ + CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: + return self.custom_get_difficulty(global_steps) else: raise RuntimeError('Unsupported curriculum schedule type') def update_difficulty(self, global_steps): - if self.state['current_difficulty'] < self.state['max_difficulty']: - self.state['current_difficulty'] = self.get_difficulty(global_steps) - 
return self.state['current_difficulty'] + if self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] < self.state[ + CURRICULUM_LEARNING_MAX_DIFFICULTY]: + self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = self.get_difficulty( + global_steps) + return self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] diff --git a/deepspeed/runtime/data_pipeline/data_routing/__init__.py b/deepspeed/runtime/data_pipeline/data_routing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_routing/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py b/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..436da95380e7e200dc53aea268a6334a21dbf33b --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py @@ -0,0 +1,117 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +from deepspeed.utils import logger +from torch import Tensor +from torch.nn import Module +from ..constants import * +from deepspeed.ops.random_ltd.dropping_utils import gpt_sample_tokens, bert_sample_tokens, GatherTokens, ScatterTokens + + +#####based on the paper random-ltd: https://arxiv.org/abs/2211.11586 +class RandomLayerTokenDrop(Module): + """ + A layer wrapper for random LTD + """ + def __init__(self, layer: Module): + super(RandomLayerTokenDrop, self).__init__() + self.random_ltd_layer = layer + self.reserved_length = None #config['max_value'] + self.random_ltd_scheduler = None + self.max_length = None + self.reserved_length = -1 + self.curr_seq = -1 + self.batch_first = False + + def init_config(self, config, scheduler, random_ltd_layer_id): + self.random_ltd_scheduler = scheduler + self.random_ltd_layer_id = random_ltd_layer_id + self.max_length = 
self.random_ltd_scheduler.state[RANDOM_LTD_MAX_VALUE] + + self.mask_name = config[RANDOM_LTD_MODEL_MASK_NAME] + self.micro_bs = config[RANDOM_LTD_MICRO_BATCH_SIZE] + self.random_ltd_num_layer = self.random_ltd_scheduler.random_ltd_layer_num + hs_order = config[RANDOM_LTD_HIDDEN_STATE_ORDER] + self.model_type = config[RANDOM_LTD_MODEL_TYPE] + + if hs_order == 'batch_seq_dim': + self.get_hidden_tensor_shape = self.get_bsh + self.batch_first = True + elif hs_order == 'seq_batch_dim': + self.get_hidden_tensor_shape = self.get_sbh + self.batch_first = False + else: + logger.warning( + "************For now, we only support batch_seq_dim or seq_batch_dim inputs. You can easily \ + your own input dimension orders************") + raise NotImplementedError + + if self.model_type == 'encoder': + self.index_generator = bert_sample_tokens + elif self.model_type == 'decoder': + self.index_generator = gpt_sample_tokens + else: + logger.warning( + "************For now, we only support encoder-only or decoder-only models************" + ) + raise NotImplementedError + + def get_bsh(self, hidden_stats): + self.curr_seq, self.curr_micro_batch = hidden_stats.size()[1], hidden_stats.size()[0] + + def get_sbh(self, hidden_stats): + self.curr_seq, self.curr_micro_batch = hidden_stats.size()[0], hidden_stats.size()[1] + + def forward(self, hidden_states, **kwargs) -> Tensor: + if self.random_ltd_scheduler is not None: + self.reserved_length = self.random_ltd_scheduler.get_current_seq() + self.get_hidden_tensor_shape(hidden_states) + if self.training and self.random_ltd_scheduler is not None and self.reserved_length < self.curr_seq: + if self.mask_name is not None: + mask = kwargs[self.mask_name] + else: + mask = None + if self.random_ltd_layer_id == 0: + sampled_indices, part_attention_mask = self.index_generator(self.reserved_length,\ + self.curr_seq, \ + self.curr_micro_batch, \ + self.random_ltd_num_layer, \ + hidden_states.device, mask) + self.random_ltd_scheduler.state[ + 
RANDOM_LTD_SAMPLE_INDEX] = sampled_indices + self.random_ltd_scheduler.state[ + RANDOM_LTD_ATTENTION_MASK] = part_attention_mask + else: + sampled_indices = self.random_ltd_scheduler.state[ + RANDOM_LTD_SAMPLE_INDEX] + part_attention_mask = self.random_ltd_scheduler.state[ + RANDOM_LTD_ATTENTION_MASK] + + + hidden_states, part_hidden_states = GatherTokens.apply(hidden_states, sampled_indices[self.random_ltd_layer_id,:,:], self.batch_first) + if self.mask_name is not None: + if self.model_type == 'encoder': + kwargs[self.mask_name] = part_attention_mask[ + self.random_ltd_layer_id] + else: + kwargs[self.mask_name] = part_attention_mask + + outputs = self.random_ltd_layer(part_hidden_states, **kwargs) + + if isinstance(outputs, tuple): + hidden_states = ScatterTokens.apply(hidden_states, outputs[0], sampled_indices[self.random_ltd_layer_id,:,:], self.batch_first) + my_list = list(outputs) + my_list[0] = hidden_states + return tuple(my_list) + elif isinstance(outputs, Tensor): + hidden_states = ScatterTokens.apply(hidden_states, outputs, sampled_indices[self.random_ltd_layer_id,:,:], self.batch_first) + return hidden_states + else: + logger.warning( + "************For now, we only support tuple and tensor output. 
\ + You need to adjust the output according to the layer in your model************" + ) + raise NotImplementedError + else: + return self.random_ltd_layer(hidden_states, **kwargs) diff --git a/deepspeed/runtime/data_pipeline/data_routing/helper.py b/deepspeed/runtime/data_pipeline/data_routing/helper.py new file mode 100644 index 0000000000000000000000000000000000000000..0f3791cf6e57b96246c960880bc33594bcef20ae --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_routing/helper.py @@ -0,0 +1,45 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +from .basic_layer import RandomLayerTokenDrop +from collections import OrderedDict +from deepspeed.compression.helper import recursive_getattr, recursive_setattr + + +def convert_to_random_ltd(model, convert_type): + if hasattr(model, 'module'): + c_model = model.module + else: + c_model = model + + for name, module in c_model.named_modules(): + + if isinstance(module, convert_type): + old_module = recursive_getattr(c_model, name) + new_module = RandomLayerTokenDrop(old_module) + recursive_setattr(c_model, name, new_module) + + model.random_ltd_initialize() + return model + + +def save_without_random_ltd(model): + if hasattr(model, 'module'): + c_model = model.module + else: + c_model = model + + model_dic = c_model.state_dict() + return remove_random_ltd_state_dict(model_dic) + + +def remove_random_ltd_state_dict(state_dict): + new_state_dict = OrderedDict() + for key, value in state_dict.items(): + if '.random_ltd_layer' in key: + new_key = ''.join(key.split('.random_ltd_layer')) + else: + new_key = key + new_state_dict[new_key] = value + return new_state_dict diff --git a/deepspeed/runtime/data_pipeline/data_routing/scheduler.py b/deepspeed/runtime/data_pipeline/data_routing/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..db0a7d4bce99c5aedad08035f777c134b4205e52 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_routing/scheduler.py @@ -0,0 +1,112 @@ +''' +Copyright 
2022 The Microsoft DeepSpeed Team +''' + +import math + +from deepspeed.utils import logger +# from deepspeed.runtime.lr_schedules import WarmupLR +from ..constants import * + +#####based on the paper random-ltd: https://arxiv.org/abs/2211.11586 + + +class BaseScheduler(object): + def __init__(self): + self.state = {} + + def __fixed_root_get_value(self, global_steps, root_degree=None): + s_state = self.state[RANDOM_LTD_SCHEDULE_CONFIG] + if root_degree is None: + root_degree = s_state['root_degree'] + next_seq = (float(global_steps) / + s_state[RANDOM_LTD_REQUIRE_STEP])**(1.0 / root_degree) + next_seq = math.floor( + next_seq * + (self.state[RANDOM_LTD_MAX_VALUE] - self.state[RANDOM_LTD_MIN_VALUE]) + + self.state[RANDOM_LTD_MIN_VALUE]) + next_seq -= (next_seq % s_state[RANDOM_LTD_INCREASE_STEP]) + next_seq = min(next_seq, self.state[RANDOM_LTD_MAX_VALUE]) + return next_seq + + def get_value(self, global_steps): + if self.state[RANDOM_LTD_SCHEDULER_TYPE] == 'fixed_linear': + return self.__fixed_root_get_value(global_steps, 1) + else: + raise RuntimeError('Unsupported random LTD schedule type') + + +class RandomLTDScheduler(BaseScheduler): + def __init__(self, config): + super().__init__() + self.model_layer_num = config[RANDOM_LTD_TOTAL_LAYER_NUM] + self.random_ltd_layer_num = config[RANDOM_LTD_LAYER_NUM] + self.config_schedule = config[RANDOM_LTD_SCHEDULER] + self.global_batch_size = config[RANDOM_LTD_GLOBAL_BATCH_SIZE] + self.reset_to_init() + + if config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]: + logger.warning("**********Work In Progress************") + raise NotImplementedError + + self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] = 0 + + # self.first_step = True + def get_total_layer_tokens(self, train_iters): + for step in range(train_iters): + self.update_seq(step) + return self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] + + def reset_to_init(self): + if self.config_schedule is not None: + self.state[RANDOM_LTD_MIN_VALUE] = 
self.config_schedule[RANDOM_LTD_MIN_VALUE] + self.state[RANDOM_LTD_MAX_VALUE] = self.config_schedule[RANDOM_LTD_MAX_VALUE] + self.state[RANDOM_LTD_CURRENT_VALUE] = self.config_schedule[ + RANDOM_LTD_MIN_VALUE] + self.state[RANDOM_LTD_SCHEDULE_CONFIG] = self.config_schedule[ + RANDOM_LTD_SCHEDULE_CONFIG] + self.state[RANDOM_LTD_SCHEDULER_TYPE] = self.config_schedule[ + RANDOM_LTD_SCHEDULER_TYPE] + self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] = 0 + self.state[RANDOM_LTD_CURR_STEP] = -1 + + def get_current_seq(self): + return self.state[RANDOM_LTD_CURRENT_VALUE] + + def set_current_seq(self, seq_length): + self.state[RANDOM_LTD_CURRENT_VALUE] = seq_length + + def get_random_ltd_layer_num(self): + return self.random_ltd_layer_num + + def get_state(self): + return self.state + + def set_state(self, state): + self.state = state + + def update_seq(self, global_steps): + if self.state[RANDOM_LTD_CURRENT_VALUE] < self.state[RANDOM_LTD_MAX_VALUE]: + self.state[RANDOM_LTD_CURRENT_VALUE] = self.get_value(global_steps) + if global_steps != self.state[RANDOM_LTD_CURR_STEP]: + self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] += self.global_batch_size*(self.state[RANDOM_LTD_CURRENT_VALUE] * self.random_ltd_layer_num \ + + self.state[RANDOM_LTD_MAX_VALUE] * (self.model_layer_num - self.random_ltd_layer_num)) + self.state[RANDOM_LTD_CURR_STEP] = global_steps + + def state_dict(self): + return { + RANDOM_LTD_CONSUMED_LAYER_TOKENS: + self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS], + RANDOM_LTD_CURR_STEP: self.state[RANDOM_LTD_CURR_STEP], + RANDOM_LTD_CURRENT_VALUE: self.state[RANDOM_LTD_CURRENT_VALUE], + RANDOM_LTD_MIN_VALUE: self.state[RANDOM_LTD_MIN_VALUE], + RANDOM_LTD_MAX_VALUE: self.state[RANDOM_LTD_MAX_VALUE], + } + + def load_state_dict(self, state_dict): + self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] = state_dict[ + RANDOM_LTD_CONSUMED_LAYER_TOKENS] + self.state[RANDOM_LTD_CURR_STEP] = state_dict[RANDOM_LTD_CURR_STEP] + self.state[RANDOM_LTD_CURRENT_VALUE] = 
state_dict[RANDOM_LTD_CURRENT_VALUE] + self.state[RANDOM_LTD_MIN_VALUE] = state_dict[RANDOM_LTD_MIN_VALUE] + self.state[RANDOM_LTD_MAX_VALUE] = state_dict[RANDOM_LTD_MAX_VALUE] diff --git a/deepspeed/runtime/data_pipeline/data_routing/utils.py b/deepspeed/runtime/data_pipeline/data_routing/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8b7bd501c75f4b8f11b6b991d7835536dd303206 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_routing/utils.py @@ -0,0 +1,27 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +import torch + + +def bsh_decoder_gather(reserved_length, hidden_states, mask): + # random-layer-token-drop + rand_list = [] + part_hidden_states = [] # batch, seq, hidden ## different from megatron + for k in range(hidden_states.size(0)): + B_tmp = torch.randperm(hidden_states.size(1), + device=hidden_states.device)[:reserved_length] + B = B_tmp.sort()[0] + rand_list.append(B) + part_hidden_states.append(hidden_states[k:k + 1, B, :]) + + part_hidden_states = torch.cat(part_hidden_states, dim=0) + part_mask = mask[:, :, :reserved_length, :reserved_length] + return part_hidden_states, rand_list, part_mask + + +def bsh_decoder_scatter(hidden_states, part_hidden_states, rand_list): + for k in range(hidden_states.size(0)): + hidden_states[k, rand_list[k], :] = part_hidden_states[k, :, :] + return hidden_states diff --git a/deepspeed/runtime/data_pipeline/data_sampling/__init__.py b/deepspeed/runtime/data_pipeline/data_sampling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_sampling/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py new file mode 100644 index 
0000000000000000000000000000000000000000..efd198f1fb041c307afdde0712f79c53da1a5533 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -0,0 +1,537 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' +import os +from collections import defaultdict +import csv +import time +from multiprocessing import Process, Manager +import numpy as np +import torch +from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset + +from deepspeed.utils import logger +from .indexed_dataset import MMapIndexedDataset +from .utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype + + +class DataAnalyzer(object): + def __init__(self, + dataset, + num_workers=1, + worker_id=0, + num_threads=1, + num_threads_reduce=1, + specific_threads=[], + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + metric_dtypes=[], + save_path="./", + collate_fn=None, + custom_map_init=None, + custom_map_update=None, + custom_map_finalize=None, + custom_reduce=None): + super().__init__() + self.dataset = dataset + self.num_workers = num_workers + self.worker_id = worker_id + self.num_threads = num_threads + self.num_threads_reduce = num_threads_reduce + self.specific_threads = specific_threads + self.batch_size = batch_size + self.metric_names = metric_names + self.metric_functions = metric_functions + self.metric_types = metric_types + self.metric_dtypes = metric_dtypes + self.save_path = save_path + self.collate_fn = collate_fn + self.custom_map_init = custom_map_init + self.custom_map_update = custom_map_update + self.custom_map_finalize = custom_map_finalize + self.custom_reduce = custom_reduce + + def init_metric_results(self, + thread_id, + metric_names, + metric_types, + metric_dtypes, + save_path, + worker_id): + metric_results = [] + for m_idx in range(len(metric_names)): + metric_name, metric_type, metric_dtype = metric_names[m_idx], \ + metric_types[m_idx], 
metric_dtypes[m_idx] + assert metric_dtype not in [np.float64, np.double], "Currently floating point metric values are not supported. Please change your metric into integer values (and potentially multiply a larger coefficient to keep the precision)." + metric_save_path = f"{save_path}/{metric_name}/worker{worker_id}_thread{thread_id}/" + os.makedirs(metric_save_path, exist_ok=True) + if metric_type == 'single_value_per_sample': + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" + sample_to_metric_builder = create_mmap_dataset_builder( + sample_to_metric_fname, + metric_dtype) + metric_to_sample_fname = f"{metric_save_path}/{metric_name}_metric_to_sample" + os.system(f"rm -rf {metric_to_sample_fname}*") + metric_to_sample_dict = defaultdict(list) + metric_results.append({ + "sample_to_metric_fname": sample_to_metric_fname, + "sample_to_metric_builder": sample_to_metric_builder, + "metric_to_sample_fname": metric_to_sample_fname, + "metric_to_sample_dict": metric_to_sample_dict + }) + elif metric_type == 'accumulate_value_over_samples': + metric_value = None + metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" + metric_results.append({ + "metric_value": metric_value, + "metric_value_fname": metric_value_fname + }) + return metric_results + + def update_metric_results(self, + data, + metric_types, + metric_functions, + metric_results): + for m_idx in range(len(metric_types)): + metric_type, metric_function, metric_result = metric_types[m_idx], \ + metric_functions[m_idx], metric_results[m_idx] + if metric_type == 'single_value_per_sample': + metric_values = metric_function(data) + for row in range(metric_values.size()[0]): + metric_result["sample_to_metric_builder"].add_item( + metric_values[row].reshape(-1)) + metric_result["metric_to_sample_dict"][ + metric_values[row].item()].append(data['index'][row][0].item()) + for m_value in metric_result["metric_to_sample_dict"]: + if 
len(metric_result["metric_to_sample_dict"][m_value]) > 100: + metric_fname = metric_result["metric_to_sample_fname"] + with open(f"{metric_fname}_{m_value}.csv", 'a') as f: + writer = csv.writer(f) + writer.writerows( + [metric_result["metric_to_sample_dict"][m_value]]) + metric_result["metric_to_sample_dict"][m_value] = [] + elif metric_type == 'accumulate_value_over_samples': + metric_values = metric_function(data) + if metric_result["metric_value"] is None: + metric_result["metric_value"] = metric_values + else: + metric_result["metric_value"].add_(metric_values) + + def finalize_metric_results(self, metric_types, metric_dtypes, metric_results): + for m_idx in range(len(metric_types)): + metric_type, metric_dtype, metric_result = metric_types[m_idx], \ + metric_dtypes[m_idx], metric_results[m_idx] + if metric_type == 'single_value_per_sample': + metric_fname = metric_result["sample_to_metric_fname"] + close_mmap_dataset_builder(metric_result["sample_to_metric_builder"], + metric_fname) + for m_value in metric_result["metric_to_sample_dict"]: + if len(metric_result["metric_to_sample_dict"][m_value]) > 0: + metric_fname = metric_result["metric_to_sample_fname"] + with open(f"{metric_fname}_{m_value}.csv", 'a') as f: + writer = csv.writer(f) + writer.writerows( + [metric_result["metric_to_sample_dict"][m_value]]) + metric_result["metric_to_sample_dict"][m_value] = [] + elif metric_type == 'accumulate_value_over_samples': + if metric_result["metric_value"] is not None: + metric_value_builder = create_mmap_dataset_builder( + metric_result["metric_value_fname"], + metric_dtype) + metric_value_builder.add_item( + metric_result["metric_value"].reshape(-1)) + close_mmap_dataset_builder(metric_value_builder, + metric_result["metric_value_fname"]) + + def run_map_helper(self, thread_id): + start_idx, end_idx = self.thread_splits[thread_id][0], \ + self.thread_splits[thread_id][1] + logger.info(f"worker {self.worker_id} thread {thread_id}: start working " \ + f"on data 
subset {start_idx} to {end_idx}") + thread_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) + sampler = BatchSampler(SequentialSampler(thread_dataset), + batch_size=self.batch_size, + drop_last=False) + if self.collate_fn is None: + iterator = iter( + DataLoader(thread_dataset, + batch_sampler=sampler, + num_workers=0, + pin_memory=False)) + else: + iterator = iter( + DataLoader(thread_dataset, + batch_sampler=sampler, + num_workers=0, + collate_fn=self.collate_fn, + pin_memory=False)) + if self.custom_map_init is None: + metric_results = self.init_metric_results(thread_id, + self.metric_names, + self.metric_types, + self.metric_dtypes, + self.save_path, + self.worker_id) + else: + metric_results = self.custom_map_init(thread_id, + self.metric_names, + self.metric_types, + self.metric_dtypes, + self.save_path, + self.worker_id) + total_sample = len(thread_dataset) + processed_sample = 0 + start = time.time() + while True: + try: + data = next(iterator) + if self.custom_map_update is None: + self.update_metric_results(data, + self.metric_types, + self.metric_functions, + metric_results) + else: + self.custom_map_update(data, + self.metric_types, + self.metric_functions, + metric_results) + processed_sample += self.batch_size + duration = (time.time() - start) / 3600.0 + remain_duration = duration * total_sample / processed_sample - duration + logger.info( + f"worker {self.worker_id} thread {thread_id}: {processed_sample} " \ + f"out of {total_sample} processed in {duration:.2f} hr, " \ + f"estimated to finish in {remain_duration:.2f} hr") + except StopIteration: + logger.info( + f"worker {self.worker_id} thread {thread_id}: reach end of file") + break + if self.custom_map_finalize is None: + self.finalize_metric_results(self.metric_types, + self.metric_dtypes, + metric_results) + else: + self.custom_map_finalize(self.metric_types, + self.metric_dtypes, + metric_results) + logger.info(f"worker {self.worker_id} thread {thread_id}: finished") + + def 
run_map(self): + self.worker_splits, self.thread_splits = split_dataset(self.dataset, + self.num_workers, self.worker_id, self.num_threads) + if len(self.specific_threads) > 0: + threads_to_run = self.specific_threads + else: + threads_to_run = list(range(self.num_threads)) + if self.num_threads > 1: + p = [] + for thread in threads_to_run: + p.append(Process(target=self.run_map_helper, args=(thread, ))) + p[thread].start() + + for thread in threads_to_run: + p[thread].join() + else: + assert self.num_threads == 1 + self.run_map_helper(0) + + def get_metric_value_percentiles(self, + metric_name, + num_sample_per_value, + total_num_samples): + logger.info(f"Checking the value percentiles of metric {metric_name}...") + processed_samples = 0 + current_percentile = 5 + for key in sorted(num_sample_per_value.keys()): + processed_samples += num_sample_per_value[key] + if processed_samples >= total_num_samples * current_percentile / 100.0: + logger.info( + f"Metric {metric_name} {current_percentile}th percentile: {key}") + current_percentile += 5 + + def merge_gather_map_stats(self, + num_workers, + num_threads, + num_threads_reduce, + t_idx_reduce, + metric_save_path, + metric_name, + return_dict): + results = [] + for w_idx in range(num_workers): + for t_idx in range(num_threads): + if (w_idx * num_threads + t_idx) % num_threads_reduce == t_idx_reduce: + w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" + w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" + w_sample_to_metric = MMapIndexedDataset(w_sample_to_metric_fname, + skip_warmup=True) + unique_v = list(np.unique(w_sample_to_metric)) + sample_to_metric_count = len(w_sample_to_metric) + logger.info( + f"Finished gathering map stats from worker {w_idx} thread {t_idx}." 
+ ) + results.append([unique_v, sample_to_metric_count]) + return_dict[t_idx_reduce] = results + + def merge_sample_to_metric(self, + t_idx_reduce, + metric_save_path, + metric_name, + metric_value_dtype, + map_worker_thread): + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" + sample_to_metric_builder = create_mmap_dataset_builder( + sample_to_metric_fname, + metric_value_dtype) + for w_t in map_worker_thread: + w_metric_save_path = f"{metric_save_path}/worker{w_t[0]}_thread{w_t[1]}/" + w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" + w_data = MMapIndexedDataset(w_sample_to_metric_fname, skip_warmup=True) + for row in range(len(w_data)): + sample_to_metric_builder.add_item( + torch.tensor(w_data[row].astype(np.int64), + dtype=torch.long)) + logger.info( + f"Finished merge_sample_to_metric from worker {w_t[0]} thread {w_t[1]}.") + close_mmap_dataset_builder(sample_to_metric_builder, sample_to_metric_fname) + + def merge_metric_to_sample(self, + t_idx_reduce, + metric_save_path, + metric_name, + sample_idx_dtype, + metric_value_dtype, + unique_metric_values, + num_workers, + num_threads): + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" + index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, + sample_idx_dtype) + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" + index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, + metric_value_dtype) + for unique_v in unique_metric_values: + samples = [] + for w_idx in range(num_workers): + for t_idx in range(num_threads): + w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" + w_metric_to_sample_fname = f"{w_metric_save_path}/{metric_name}_metric_to_sample_{unique_v}.csv" + if os.path.isfile(w_metric_to_sample_fname): + with open(w_metric_to_sample_fname, 'r') as f: + datareader = 
csv.reader(f) + for row in datareader: + samples += [int(x) for x in row] + index_to_sample_builder.add_item(torch.tensor(samples, dtype=torch.long)) + index_to_metric_builder.add_item(torch.tensor([unique_v], dtype=torch.long)) + logger.info(f"Finished reducing metric {metric_name} value {unique_v}.") + close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) + close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) + + def merge_map_results(self, + dataset, + metric_names, + metric_types, + save_path, + num_workers, + num_threads, + num_threads_reduce): + total_num_samples = len(dataset) + sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) + logger.info( + f"Total number of data samples: {total_num_samples}. Will use {sample_idx_dtype} to store the sample indexes." + ) + for m_idx in range(len(metric_names)): + metric_name, metric_type = metric_names[m_idx], metric_types[m_idx] + if metric_type == 'single_value_per_sample': + metric_save_path = f"{save_path}/{metric_name}/" + sample_to_metric_count = 0 + unique_metric_values = set([]) + manager = Manager() + return_dict = manager.dict() + p = [] + for t_idx_reduce in range(num_threads_reduce): + p.append( + Process(target=self.merge_gather_map_stats, + args=( + num_workers, + num_threads, + num_threads_reduce, + t_idx_reduce, + metric_save_path, + metric_name, + return_dict, + ))) + p[t_idx_reduce].start() + for t_idx_reduce in range(num_threads_reduce): + p[t_idx_reduce].join() + for t_idx_reduce in range(num_threads_reduce): + results = return_dict[t_idx_reduce] + for res in results: + unique_metric_values = unique_metric_values.union(set(res[0])) + sample_to_metric_count += res[1] + value_max = max(unique_metric_values) + value_min = min(unique_metric_values) + assert sample_to_metric_count == total_num_samples, "The number of samples in map result files are not correct. It's possible that some map worker didn't finish successfully." 
+ metric_value_dtype = find_fit_int_dtype(value_min, value_max) + logger.info( + f"Metric {metric_name} has values between {value_min} and {value_max}. Will use {metric_value_dtype} to store the metric values." + ) + + # sample_to_metric + map_worker_thread = [] + for w_idx in range(num_workers): + for t_idx in range(num_threads): + map_worker_thread.append([w_idx, t_idx]) + thread_splits = split_index(0, + len(map_worker_thread), + num_threads_reduce) + p = [] + for t_idx_reduce in range(num_threads_reduce): + start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] + p.append( + Process(target=self.merge_sample_to_metric, + args=( + t_idx_reduce, + metric_save_path, + metric_name, + metric_value_dtype, + map_worker_thread[start_idx:end_idx], + ))) + p[t_idx_reduce].start() + for t_idx_reduce in range(num_threads_reduce): + p[t_idx_reduce].join() + + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" + sample_to_metric_builder = create_mmap_dataset_builder( + sample_to_metric_fname, + metric_value_dtype) + for t_idx_reduce in range(num_threads_reduce): + chunk_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" + logger.info(f"Merging file {chunk_fname}") + sample_to_metric_builder.merge_file_(chunk_fname) + close_mmap_dataset_builder(sample_to_metric_builder, + sample_to_metric_fname) + sample_to_metric = MMapIndexedDataset(sample_to_metric_fname, + skip_warmup=True) + assert len(sample_to_metric) == total_num_samples + + # metric_to_sample + unique_metric_values = list(sorted(unique_metric_values)) + thread_splits = split_index(0, + len(unique_metric_values), + num_threads_reduce) + p = [] + for t_idx_reduce in range(num_threads_reduce): + start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] + p.append( + Process(target=self.merge_metric_to_sample, + args=( + t_idx_reduce, + metric_save_path, + metric_name, + sample_idx_dtype, + metric_value_dtype, 
+ unique_metric_values[start_idx:end_idx], + num_workers, + num_threads, + ))) + p[t_idx_reduce].start() + for t_idx_reduce in range(num_threads_reduce): + p[t_idx_reduce].join() + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" + index_to_sample_builder = create_mmap_dataset_builder( + index_to_sample_fname, + sample_idx_dtype) + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" + index_to_metric_builder = create_mmap_dataset_builder( + index_to_metric_fname, + metric_value_dtype) + for t_idx_reduce in range(num_threads_reduce): + chunk_is_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" + logger.info(f"Merging file {chunk_is_fname}") + index_to_sample_builder.merge_file_(chunk_is_fname) + chunk_im_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" + logger.info(f"Merging file {chunk_im_fname}") + index_to_metric_builder.merge_file_(chunk_im_fname) + close_mmap_dataset_builder(index_to_sample_builder, + index_to_sample_fname) + close_mmap_dataset_builder(index_to_metric_builder, + index_to_metric_fname) + num_sample_per_value = {} + index_to_sample = MMapIndexedDataset(index_to_sample_fname, + skip_warmup=True) + index_to_metric = MMapIndexedDataset(index_to_metric_fname, + skip_warmup=True) + index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" + index_to_sample_merged_builder = create_mmap_dataset_builder( + index_to_sample_merged_fname, + sample_idx_dtype) + for v_idx in range(len(index_to_sample)): + if v_idx > 0: + assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] + num_sample_per_value[index_to_metric[v_idx][0]] = len( + index_to_sample[v_idx]) + assert sum(num_sample_per_value.values()) == total_num_samples + merge_step = len(index_to_sample) // 100 + for v_idx in range(0, len(index_to_sample), merge_step): + merged_samples = np.copy( + np.concatenate( + 
index_to_sample[v_idx:min(len(index_to_sample), + (v_idx + merge_step))], + axis=None)) + index_to_sample_merged_builder.add_item( + torch.tensor(merged_samples.astype(np.int64), + dtype=torch.long)) + logger.info( + f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}." + ) + close_mmap_dataset_builder(index_to_sample_merged_builder, + index_to_sample_merged_fname) + self.get_metric_value_percentiles(metric_name, + num_sample_per_value, + total_num_samples) + elif metric_type == 'accumulate_value_over_samples': + metric_save_path = f"{save_path}/{metric_name}/" + metric_value = None + for w_idx in range(num_workers): + for t_idx in range(num_threads): + w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" + w_metric_value_fname = f"{w_metric_save_path}/{metric_name}_metric_value" + w_metric_value = MMapIndexedDataset(w_metric_value_fname, + skip_warmup=True) + if metric_value is None: + metric_value = np.copy(w_metric_value[0]) + else: + metric_value += np.copy(w_metric_value[0]) + value_max = int(max(metric_value)) + value_min = int(min(metric_value)) + metric_value_dtype = find_fit_int_dtype(value_min, value_max) + metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" + metric_value_builder = create_mmap_dataset_builder( + metric_value_fname, + metric_value_dtype) + metric_value_builder.add_item( + torch.tensor(metric_value.astype(np.int64), + dtype=torch.long)) + close_mmap_dataset_builder(metric_value_builder, metric_value_fname) + + def run_reduce(self): + if self.custom_reduce is None: + self.merge_map_results(self.dataset, + self.metric_names, + self.metric_types, + self.save_path, + self.num_workers, + self.num_threads, + self.num_threads_reduce) + else: + self.custom_reduce(self.dataset, + self.metric_names, + self.metric_types, + self.save_path, + self.num_workers, + self.num_threads, + self.num_threads_reduce) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py 
b/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..24dbcfa93a0beac651dda165ac2c823c20eb5a05 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py @@ -0,0 +1,390 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +Part of this code was adopted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/data_samplers.py +''' +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import os +import numpy as np + +import deepspeed.comm as dist +from deepspeed.utils import logger +from deepspeed.accelerator import get_accelerator +from ..constants import * +from ..curriculum_scheduler import CurriculumScheduler +from .indexed_dataset import MMapIndexedDataset +from .utils import create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype + + +class DeepSpeedDataSampler(object): + def __init__(self, + data_efficiency_config, + one_epoch_total_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + data_parallel_group, + gradient_accumulation_steps, + global_rank, + drop_last=True): + # Keep a copy of input params for later use. 
+ self.data_efficiency_config = data_efficiency_config + self.one_epoch_total_samples = one_epoch_total_samples + self.index_dtype = find_fit_int_dtype(0, one_epoch_total_samples) + self.total_samples = one_epoch_total_samples * self.data_efficiency_config[ + DATA_SAMPLING][DATA_SAMPLING_NUM_EPOCHS] + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.data_parallel_group = data_parallel_group + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.global_batch_size = self.micro_batch_times_data_parallel_size * \ + self.gradient_accumulation_steps + self.global_rank = global_rank + self.drop_last = drop_last + self.np_rng = np.random.default_rng( + self.data_efficiency_config[DATA_EFFICIENCY_SEED]) + self.state = {} + self.batch = [] + self.consumed_samples = 0 + if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_ENABLED]: + self.curriculum_step = 0 + self.current_difficulties = {} + self.data_cluster_paths = [] + self.data_cluster_current_position = [] + self.curriculum_schedulers = {} + self.curriculum_index_to_sample = {} + self.curriculum_index_to_metric = {} + self.difficulty_type = {} + self.clustering_type = {} + self.data_1epoch_size = None + if self.global_rank == 0: + self.data_clusters = [] + self.data_cluster_sizes = [] + cluster_path = self.data_efficiency_config[DATA_SAMPLING][ + CURRICULUM_LEARNING][CURRICULUM_LEARNING_CLUSTER_PATH] + if not os.path.exists(cluster_path): + os.makedirs(cluster_path) + for metric in self.data_efficiency_config[DATA_SAMPLING][ + CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS]: + self.curriculum_schedulers[metric] = CurriculumScheduler( + data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING] + [CURRICULUM_LEARNING_METRICS][metric]) + self.difficulty_type[metric] = data_efficiency_config[DATA_SAMPLING][ + 
CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][metric][ + CURRICULUM_LEARNING_DIFFICULTY_TYPE] + self.clustering_type[metric] = data_efficiency_config[DATA_SAMPLING][ + CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][metric][ + CURRICULUM_LEARNING_CLUSTERING_TYPE] + if self.global_rank == 0: + if self.clustering_type[metric] != CURRICULUM_LEARNING_SINGLE_CLUSTER: + self.curriculum_index_to_sample[metric] = MMapIndexedDataset( + data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING] + [CURRICULUM_LEARNING_METRICS][metric] + [CURRICULUM_LEARNING_SAMPLE_PATH], + skip_warmup=True) + if self.difficulty_type[ + metric] == CURRICULUM_LEARNING_VALUE_BASED: + self.curriculum_index_to_metric[metric] = MMapIndexedDataset( + data_efficiency_config[DATA_SAMPLING] + [CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS] + [metric][CURRICULUM_LEARNING_METRIC_PATH], + skip_warmup=True) + + # Sanity checks. + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples + + def set_custom_curriculum_learning_schedule(self, schedule_func_dict): + for metric in self.curriculum_schedulers: + if metric in schedule_func_dict: + self.curriculum_schedulers[metric].set_custom_get_difficulty( + schedule_func_dict[metric]) + + def get_start_end_idx(self): + start_idx = self.data_parallel_rank * self.micro_batch_size + end_idx = start_idx + self.micro_batch_size + return start_idx, end_idx + + def get_sample_based_on_metric_value(self, metric, value_start, value_end): + new_samples = None + for row in range(len(self.curriculum_index_to_sample[metric])): + if self.curriculum_index_to_metric[metric][ + row] <= value_end and self.curriculum_index_to_metric[metric][ + row] 
> value_start: + row_samples = np.copy(self.curriculum_index_to_sample[metric][row]) + new_samples = row_samples if new_samples is None else np.concatenate( + (new_samples, + row_samples), + axis=None) + return new_samples + + def get_sample_based_on_metric_percentile(self, + metric, + percentile_start, + percentile_end): + new_samples = None + if self.data_1epoch_size is None: + self.data_1epoch_size = sum( + len(x) for x in self.curriculum_index_to_sample[metric]) + max_percentile = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_METRICS][metric][CURRICULUM_LEARNING_MAX_DIFFICULTY] + sample_per_percentile = self.data_1epoch_size // max_percentile + start_count = sample_per_percentile * percentile_start + end_count = sample_per_percentile * percentile_end + if percentile_end == max_percentile: + end_count = self.data_1epoch_size + current_count = 0 + for row in range(len(self.curriculum_index_to_sample[metric])): + row_size = len(self.curriculum_index_to_sample[metric][row]) + if current_count + row_size > start_count: + row_start = max(0, start_count - current_count) + if current_count + row_size <= end_count: + row_end = row_size + else: + row_end = end_count - current_count + row_samples = np.copy( + self.curriculum_index_to_sample[metric][row][row_start:row_end]) + new_samples = row_samples if new_samples is None else np.concatenate( + (new_samples, + row_samples), + axis=None) + current_count += row_size + if current_count >= end_count: + break + return new_samples + + def get_new_cluster(self, previous_difficulties): + cluster_fname = CURRICULUM_LEARNING_CLUSTER_PREFIX + for metric in self.curriculum_schedulers: + cluster_fname = f"{cluster_fname}_{metric}{self.current_difficulties[metric]}" + cluster_path = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_CLUSTER_PATH] + cluster_path = f"{cluster_path}/{cluster_fname}" + if self.global_rank == 0: + new_cluster = None + 
need_clustering = 0 + for metric in self.clustering_type: + if self.clustering_type[metric] != CURRICULUM_LEARNING_SINGLE_CLUSTER: + need_clustering += 1 + if need_clustering > 1: + for metric in self.curriculum_schedulers: + if self.clustering_type[ + metric] == CURRICULUM_LEARNING_SINGLE_CLUSTER: + metric_cluster = np.arange(start=0, + stop=self.one_epoch_total_samples, + step=1, + dtype=self.index_dtype) + else: + if self.difficulty_type[ + metric] == CURRICULUM_LEARNING_VALUE_BASED: + metric_cluster = self.get_sample_based_on_metric_value( + metric, + float('-inf'), + self.current_difficulties[metric]) + elif self.difficulty_type[ + metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: + metric_cluster = self.get_sample_based_on_metric_percentile( + metric, + 0, + self.current_difficulties[metric]) + new_cluster = metric_cluster if new_cluster is None else \ + np.intersect1d(new_cluster, metric_cluster, assume_unique=True) + for cluster in self.data_clusters: + new_cluster = np.setdiff1d(new_cluster, + cluster[0], + assume_unique=True) + else: + if len(self.data_clusters) == 0: + new_cluster = np.arange(start=0, + stop=self.one_epoch_total_samples, + step=1, + dtype=self.index_dtype) + for metric in self.curriculum_schedulers: + if self.clustering_type[metric] != CURRICULUM_LEARNING_SINGLE_CLUSTER: + if self.difficulty_type[ + metric] == CURRICULUM_LEARNING_VALUE_BASED: + new_cluster = self.get_sample_based_on_metric_value( + metric, + previous_difficulties[metric], + self.current_difficulties[metric]) + elif self.difficulty_type[ + metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: + new_cluster = self.get_sample_based_on_metric_percentile( + metric, + previous_difficulties[metric], + self.current_difficulties[metric]) + if new_cluster is not None and len(new_cluster) > 0: + logger.info( + f"new data cluster (previous_difficulties {previous_difficulties}, current_difficulties {self.current_difficulties}) with size {len(new_cluster)} generated." 
+ ) + self.np_rng.shuffle(new_cluster) + cluster_builder = create_mmap_dataset_builder(cluster_path, + self.index_dtype) + cluster_builder.add_item_numpy(new_cluster) + close_mmap_dataset_builder(cluster_builder, cluster_path) + self.data_clusters.append( + MMapIndexedDataset(cluster_path, + skip_warmup=True)) + self.data_cluster_sizes.append(len(self.data_clusters[-1][0])) + else: + logger.info( + f"new data cluster (previous_difficulties {previous_difficulties}, current_difficulties {self.current_difficulties}) has no matched data thus skipped." + ) + dist.barrier(group=self.data_parallel_group) + if os.path.isfile(f"{cluster_path}.bin"): + self.data_cluster_paths.append(cluster_fname) + self.data_cluster_current_position.append(0) + + def sample_from_clusters(self): + num_clusters = len(self.data_clusters) + weight_sum = sum(self.data_cluster_sizes) + weights = [x / weight_sum for x in self.data_cluster_sizes] + samples = self.np_rng.choice(num_clusters, + self.global_batch_size, + replace=True, + p=weights) + samples = np.bincount(samples, minlength=num_clusters) + return samples + + def reshuffle_clusters(self, cidx): + cluster_fname = self.data_cluster_paths[cidx] + cluster_path = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_CLUSTER_PATH] + cluster_path = f"{cluster_path}/{cluster_fname}" + cluster = np.copy(self.data_clusters[cidx][0]) + self.np_rng.shuffle(cluster) + cluster_builder = create_mmap_dataset_builder(cluster_path, self.index_dtype) + cluster_builder.add_item_numpy(cluster) + close_mmap_dataset_builder(cluster_builder, cluster_path) + self.data_clusters[cidx] = MMapIndexedDataset(cluster_path, skip_warmup=True) + + def get_sample_from_cluster(self, cidx, num_samples): + start_idx = self.data_cluster_current_position[cidx] + samples = list( + np.copy(self.data_clusters[cidx][0][start_idx:(start_idx + num_samples)])) + self.data_cluster_current_position[cidx] += num_samples + if len(samples) < num_samples: 
+ num_samples_remained = num_samples - len(samples) + logger.info(f"reshuffling cluster {cidx}.") + self.reshuffle_clusters(cidx) + samples += list(np.copy(self.data_clusters[cidx][0][:num_samples_remained])) + self.data_cluster_current_position[cidx] = num_samples_remained + return samples + + def get_next_global_batch(self): + if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_ENABLED]: + self.curriculum_step += 1 + new_cluster = False + previous_difficulties = {} + for metric in self.curriculum_schedulers: + next_difficulty = self.curriculum_schedulers[metric].update_difficulty( + self.curriculum_step) + if metric not in self.current_difficulties or \ + next_difficulty != self.current_difficulties[metric]: + new_cluster = True + if metric in self.current_difficulties: + previous_difficulties[metric] = self.current_difficulties[metric] + else: + if self.difficulty_type[metric] == CURRICULUM_LEARNING_VALUE_BASED: + previous_difficulties[metric] = float('-inf') + elif self.difficulty_type[ + metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: + previous_difficulties[metric] = 0 + self.current_difficulties[metric] = next_difficulty + if new_cluster: + self.get_new_cluster(previous_difficulties) + if self.global_rank == 0: + samples_per_cluster = self.sample_from_clusters() + batch = [] + for cidx in range(len(samples_per_cluster)): + batch += self.get_sample_from_cluster(cidx, + samples_per_cluster[cidx]) + self.np_rng.shuffle(batch) + batch = torch.tensor(batch, + device=get_accelerator().current_device_name(), + dtype=torch.long).view(-1) + else: + batch = torch.empty(self.global_batch_size, + device=get_accelerator().current_device_name(), + dtype=torch.long) + dist.broadcast(batch, 0, group=self.data_parallel_group) + self.batch = batch.tolist() + + def __iter__(self): + while self.consumed_samples <= self.total_samples: + if len(self.batch) == 0: + self.get_next_global_batch() + current_batch = 
self.batch[:self.micro_batch_times_data_parallel_size] + self.batch = self.batch[self.micro_batch_times_data_parallel_size:] + if len(current_batch) == self.micro_batch_times_data_parallel_size or \ + (len(current_batch) > 0 and not self.drop_last): + start_idx, end_idx = self.get_start_end_idx() + yield current_batch[start_idx:end_idx] + self.consumed_samples += len(current_batch) + current_batch = [] + + def state_dict(self): + return { + CURRICULUM_LEARNING_BATCH: self.batch, + CURRICULUM_LEARNING_CONSUMED_SAMPLES: self.consumed_samples, + CURRICULUM_LEARNING_STEP: self.curriculum_step, + CURRICULUM_LEARNING_CURRENT_DIFFICULTIES: self.current_difficulties, + CURRICULUM_LEARNING_DATA_CLUSTER_PATHS: self.data_cluster_paths, + CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION: + self.data_cluster_current_position, + CURRICULUM_LEARNING_NP_RNG_STATE: np.random.get_state() + } + + def load_state_dict(self, state_dict): + self.batch = state_dict[CURRICULUM_LEARNING_BATCH] + self.consumed_samples = state_dict[CURRICULUM_LEARNING_CONSUMED_SAMPLES] + self.curriculum_step = state_dict[CURRICULUM_LEARNING_STEP] + self.current_difficulties = state_dict[CURRICULUM_LEARNING_CURRENT_DIFFICULTIES] + self.data_cluster_paths = state_dict[CURRICULUM_LEARNING_DATA_CLUSTER_PATHS] + self.data_cluster_current_position = state_dict[ + CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION] + np.random.set_state(state_dict[CURRICULUM_LEARNING_NP_RNG_STATE]) + cluster_root_path = self.data_efficiency_config[DATA_SAMPLING][ + CURRICULUM_LEARNING][CURRICULUM_LEARNING_CLUSTER_PATH] + # Backward compatibility: previously data_cluster_paths were stored as + # absolute paths. Now we changed it to just the file name so that even + # if user moved the cluster files, the checkpoint loading still works + # as long as user set the correct new CURRICULUM_LEARNING_CLUSTER_PATH + # in deepspeed json config. 
+ for idx in range(len(self.data_cluster_paths)): + if '/' in self.data_cluster_paths[idx]: + self.data_cluster_paths[idx] = self.data_cluster_paths[idx].split( + '/')[-1] + if self.global_rank == 0: + for cluster_fname in self.data_cluster_paths: + cluster_path = f"{cluster_root_path}/{cluster_fname}" + self.data_clusters.append( + MMapIndexedDataset(cluster_path, + skip_warmup=True)) + self.data_cluster_sizes.append(len(self.data_clusters[-1][0])) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b2a31663e923df62404f879fa6287075dadbd23d --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -0,0 +1,645 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +Part of this code was adopted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py +''' + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# copied from fairseq/fairseq/data/indexed_dataset.py +# Removed IndexedRawTextDataset since it relied on Fairseq dictionary +# other slight modifications to remove fairseq dependencies +# Added document index to index file and made it accessible. +# An empty sentence no longer separates documents. 
+ +# Some of the fixes/improvements are adopted from +# https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/main/megatron/data/indexed_dataset.py + +from functools import lru_cache +import os +import shutil +import struct +from itertools import accumulate + +import numpy as np +import torch + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def get_available_dataset_impl(): + return ['lazy', 'cached', 'mmap'] + + +def infer_dataset_impl(path): + if IndexedDataset.exists(path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + if magic == IndexedDataset._HDR_MAGIC: + return 'cached' + elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: + return 'mmap' + else: + return None + else: + print(f"Dataset does not exist: {path}") + print( + "Path should be a basename that both .idx and .bin can be appended to get full filenames." + ) + return None + + +def make_builder(out_file, impl, vocab_size=None): + if impl == 'mmap': + return MMapIndexedDatasetBuilder(out_file, + dtype=__best_fitting_dtype(vocab_size)) + else: + return IndexedDatasetBuilder(out_file) + + +def make_dataset(path, impl, skip_warmup=False): + if not IndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print( + "Path should be a basename that both .idx and .bin can be appended to get full filenames." 
+ ) + return None + if impl == 'infer': + impl = infer_dataset_impl(path) + if impl == 'lazy' and IndexedDataset.exists(path): + return IndexedDataset(path) + elif impl == 'cached' and IndexedDataset.exists(path): + return IndexedCachedDataset(path) + elif impl == 'mmap' and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup) + print(f"Unknown dataset implementation: {impl}") + return None + + +def dataset_exists(path, impl): + if impl == 'mmap': + return MMapIndexedDataset.exists(path) + else: + return IndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float64, + 7: np.double, + 8: np.uint16, + 9: np.uint32, + 10: np.uint64 +} + + +def code(dtype): + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +def index_file_path(prefix_path): + return prefix_path + '.idx' + + +def data_file_path(prefix_path): + return prefix_path + '.bin' + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +class IndexedDataset(torch.utils.data.Dataset): + """Loader for IndexedDataset""" + _HDR_MAGIC = b'TNTIDX\x00\x00' + + def __init__(self, path): + super().__init__() + self.path = path + self.data_file = None + self.read_index(path) + + def read_index(self, path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + assert magic == self._HDR_MAGIC, ( + 'Index file doesn\'t match expected format. ' + 'Make sure that --dataset-impl is configured properly.' 
+ ) + version = f.read(8) + assert struct.unpack('= self._len: + raise IndexError('index out of range') + + def __del__(self): + if self.data_file: + self.data_file.close() + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if not self.data_file: + self.read_data(self.path) + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + return a + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] + size = sum(sizes) + a = np.empty(size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[start] * self.element_size) + self.data_file.readinto(a) + offsets = list(accumulate(sizes)) + sents = np.split(a, offsets[:-1]) + return sents + + def __len__(self): + return self._len + + def num_tokens(self, index): + return self.sizes[index] + + def size(self, index): + return self.sizes[index] + + @staticmethod + def exists(path): + return (os.path.exists(index_file_path(path)) + and os.path.exists(data_file_path(path))) + + @property + def supports_prefetch(self): + return False # avoid prefetching to save memory + + +class IndexedCachedDataset(IndexedDataset): + def __init__(self, path): + super().__init__(path) + self.cache = None + self.cache_index = {} + + @property + def supports_prefetch(self): + return True + + def prefetch(self, indices): + if all(i in self.cache_index for i in indices): + return + if not self.data_file: + self.read_data(self.path) + indices = sorted(set(indices)) + total_size = 0 + for i in indices: + total_size += self.data_offsets[i + 1] - self.data_offsets[i] + self.cache = np.empty(total_size, dtype=self.dtype) + ptx = 0 + self.cache_index.clear() + for 
i in indices: + self.cache_index[i] = ptx + size = self.data_offsets[i + 1] - self.data_offsets[i] + a = self.cache[ptx:ptx + size] + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + ptx += size + if self.data_file: + # close and delete data file after prefetch so we can pickle + self.data_file.close() + self.data_file = None + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + ptx = self.cache_index[i] + np.copyto(a, self.cache[ptx:ptx + a.size]) + return a + elif isinstance(idx, slice): + # Hack just to make this work, can optimizer later if necessary + sents = [] + for i in range(*idx.indices(len(self))): + sents.append(self[i]) + return sents + + +class IndexedDatasetBuilder(object): + element_sizes = { + np.uint8: 1, + np.int8: 1, + np.int16: 2, + np.int32: 4, + np.int64: 8, + np.float64: 4, + np.double: 8 + } + + def __init__(self, out_file, dtype=np.int32): + self.out_file = open(out_file, 'wb') + self.dtype = dtype + self.data_offsets = [0] + self.dim_offsets = [0] + self.sizes = [] + self.element_size = self.element_sizes[self.dtype] + self.doc_idx = [0] + + def add_item(self, tensor): + bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) + self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) + for s in tensor.size(): + self.sizes.append(s) + self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) + + def end_document(self): + self.doc_idx.append(len(self.sizes)) + + def merge_file_(self, another_file): + index = IndexedDataset(another_file) + assert index.dtype == self.dtype + + doc_offset = len(self.sizes) + + begin = self.data_offsets[-1] + for data_offset in index.data_offsets[1:]: + self.data_offsets.append(begin + data_offset) + self.sizes.extend(index.sizes) + begin = 
self.dim_offsets[-1] + for dim_offset in index.dim_offsets[1:]: + self.dim_offsets.append(begin + dim_offset) + self.doc_idx.extend((doc_offset + index.doc_idx)[1:]) + + with open(data_file_path(another_file), 'rb') as f: + while True: + data = f.read(1024) + if data: + self.out_file.write(data) + else: + break + + def finalize(self, index_file): + self.out_file.close() + index = open(index_file, 'wb') + index.write(b'TNTIDX\x00\x00') + index.write(struct.pack(' [0, 10, 30, 35] + if arr.size > 1: + arr[1:] = arr[:-1] + if arr.size > 0: + arr[0] = 0 + + +def get_pointers_with_total(sizes, elemsize, dtype): + """Return a numpy array of type np.dtype giving the byte offsets. + + Multiplies values in the sizes array by elemsize (bytes), + and then computes an exclusive scan to get byte offsets. + Returns the total number of bytes as second item in a tuple. + """ + + # scale values in sizes array by elemsize to get sizes in bytes + pointers = np.array(sizes, dtype=dtype) + pointers *= elemsize + np.cumsum(pointers, axis=0, out=pointers) + + # get total number of bytes from all sizes (last element) + bytes_last = pointers[-1] if len(sizes) > 0 else 0 + + # convert to byte offsets + exscan_from_cumsum_(pointers) + + return pointers, bytes_last + + +class MMapIndexedDataset(torch.utils.data.Dataset): + class Index(object): + _HDR_MAGIC = b'MMIDIDX\x00\x00' + + @classmethod + def writer(cls, path, dtype): + class _Writer(object): + def __enter__(self): + self._file = open(path, 'wb') + + self._file.write(cls._HDR_MAGIC) + self._file.write(struct.pack('= 0: + if max_value <= 255: + return np.uint8 + elif max_value <= 65535: + return np.uint16 + elif max_value <= 4294967295: + return np.uint32 + else: + return np.uint64 + else: + if max_value <= 127 and min_value >= -128: + return np.int8 + elif max_value <= 32767 and min_value >= -32768: + return np.int16 + elif max_value <= 2147483647 and min_value >= -2147483648: + return np.int32 + else: + return np.int64 + + +def 
split_index(start_idx, end_idx, num_partitions): + partition_size = math.ceil((end_idx - start_idx) / num_partitions) + partitions = [[ + start_idx + x * partition_size, + min(end_idx, + start_idx + (x + 1) * partition_size) + ] for x in range(num_partitions)] + return partitions + + +def split_dataset(dataset, num_workers, worker_id, num_threads): + worker_splits = split_index(0, len(dataset), num_workers) + thread_splits = split_index(worker_splits[worker_id][0], + worker_splits[worker_id][1], + num_threads) + return worker_splits, thread_splits + + +def create_mmap_dataset_builder(fname, dtype): + logger.info(f"Creating mmap dataset builder at {fname}.") + return MMapIndexedDatasetBuilder(f"{fname}.bin", dtype=dtype) + + +def close_mmap_dataset_builder(builder, fname): + builder.end_document() + builder.finalize(f"{fname}.idx") + logger.info(f"Finalized mmap dataset builder at {fname}.") diff --git a/deepspeed/runtime/dataloader.py b/deepspeed/runtime/dataloader.py index acd21d9557fe2530b97999fd2f4e3f5719d22327..3734eedeb5c907a902d050d72fa88817d8bab50d 100644 --- a/deepspeed/runtime/dataloader.py +++ b/deepspeed/runtime/dataloader.py @@ -2,9 +2,15 @@ Copyright 2019 The Microsoft DeepSpeed Team ''' -import torch from torch.utils.data import DataLoader, RandomSampler from torch.utils.data.distributed import DistributedSampler +from deepspeed.accelerator import get_accelerator + +from deepspeed.runtime.data_pipeline.data_sampling.data_sampler import DeepSpeedDataSampler +from deepspeed.runtime.data_pipeline.constants import CURRICULUM_LEARNING, \ + DATA_EFFICIENCY, DATA_SAMPLING_NUM_WORKERS +from deepspeed.runtime.constants import GRADIENT_ACCUMULATION_STEPS, \ + DATA_PARALLEL_GROUP, GLOBAL_RANK class RepeatingLoader: @@ -42,24 +48,46 @@ class DeepSpeedDataLoader(object): data_sampler=None, data_parallel_world_size=None, data_parallel_rank=None, - dataloader_drop_last=False): + dataloader_drop_last=False, + deepspeed_dataloader_config={}): + 
self.deepspeed_dataloader_config = deepspeed_dataloader_config self.tput_timer = tput_timer self.batch_size = batch_size - - if local_rank >= 0: - if data_sampler is None: - data_sampler = DistributedSampler(dataset=dataset, - num_replicas=data_parallel_world_size, - rank=data_parallel_rank) - device_count = 1 + self.curriculum_learning_enabled = False + if CURRICULUM_LEARNING in deepspeed_dataloader_config: + self.curriculum_learning_enabled = deepspeed_dataloader_config[ + CURRICULUM_LEARNING] + + if self.curriculum_learning_enabled: + data_sampler = DeepSpeedDataSampler( + self.deepspeed_dataloader_config[DATA_EFFICIENCY], + len(dataset), + self.batch_size, + data_parallel_rank, + data_parallel_world_size, + self.deepspeed_dataloader_config[DATA_PARALLEL_GROUP], + self.deepspeed_dataloader_config[GRADIENT_ACCUMULATION_STEPS], + self.deepspeed_dataloader_config[GLOBAL_RANK], + drop_last=dataloader_drop_last) + device_count = get_accelerator().device_count() + num_local_io_workers = self.deepspeed_dataloader_config[ + DATA_SAMPLING_NUM_WORKERS] else: - if data_sampler is None: - data_sampler = RandomSampler(dataset) - device_count = torch.cuda.device_count() - batch_size *= device_count - - if num_local_io_workers is None: - num_local_io_workers = 2 * device_count + if local_rank >= 0: + if data_sampler is None: + data_sampler = DistributedSampler( + dataset=dataset, + num_replicas=data_parallel_world_size, + rank=data_parallel_rank) + device_count = 1 + else: + if data_sampler is None: + data_sampler = RandomSampler(dataset) + device_count = get_accelerator().device_count() + batch_size *= device_count + + if num_local_io_workers is None: + num_local_io_workers = 2 * device_count self.num_local_io_workers = num_local_io_workers self.data_sampler = data_sampler @@ -68,9 +96,15 @@ class DeepSpeedDataLoader(object): self.device_count = device_count self.batch_size = batch_size self.pin_memory = pin_memory - self.len = len(self.data_sampler) self.data = None 
self.dataloader_drop_last = dataloader_drop_last + self.post_process_func = None + + if self.dataloader_drop_last: + self.len = len(self.data_sampler) // self.batch_size + else: + from math import ceil + self.len = ceil(len(self.data_sampler) / self.batch_size) def __iter__(self): self._create_dataloader() @@ -82,27 +116,48 @@ class DeepSpeedDataLoader(object): def __next__(self): if self.tput_timer: self.tput_timer.start() - return next(self.data) + if self.curriculum_learning_enabled: + data = next(self.data_iterator) + if self.post_process_func is not None: + data = self.post_process_func(data, self.data_sampler.state_dict()) + return data + else: + return next(self.data) def _create_dataloader(self): - if self.collate_fn is None: - self.dataloader = DataLoader(self.dataset, - batch_size=self.batch_size, - pin_memory=self.pin_memory, - sampler=self.data_sampler, - num_workers=self.num_local_io_workers, - drop_last=self.dataloader_drop_last) + if self.curriculum_learning_enabled: + if self.collate_fn is None: + self.dataloader = DataLoader(self.dataset, + pin_memory=self.pin_memory, + batch_sampler=self.data_sampler, + num_workers=self.num_local_io_workers) + else: + self.dataloader = DataLoader(self.dataset, + pin_memory=self.pin_memory, + batch_sampler=self.data_sampler, + collate_fn=self.collate_fn, + num_workers=self.num_local_io_workers) + self.data_iterator = iter(self.dataloader) + return self.dataloader else: - self.dataloader = DataLoader(self.dataset, - batch_size=self.batch_size, - pin_memory=self.pin_memory, - sampler=self.data_sampler, - collate_fn=self.collate_fn, - num_workers=self.num_local_io_workers, - drop_last=self.dataloader_drop_last) - self.data = (x for x in self.dataloader) - - return self.dataloader + if self.collate_fn is None: + self.dataloader = DataLoader(self.dataset, + batch_size=self.batch_size, + pin_memory=self.pin_memory, + sampler=self.data_sampler, + num_workers=self.num_local_io_workers, + 
drop_last=self.dataloader_drop_last) + else: + self.dataloader = DataLoader(self.dataset, + batch_size=self.batch_size, + pin_memory=self.pin_memory, + sampler=self.data_sampler, + collate_fn=self.collate_fn, + num_workers=self.num_local_io_workers, + drop_last=self.dataloader_drop_last) + self.data = (x for x in self.dataloader) + + return self.dataloader # DataLoader([(torch.randn(3, 3), torch.tensor(i % 2)) for i in range(10)], batch_size=2)) diff --git a/deepspeed/runtime/eigenvalue.py b/deepspeed/runtime/eigenvalue.py old mode 100644 new mode 100755 index 490899bda5766a91eea10019ff080f167dcd2fc5..618ac00caff43f728c421d6b28bddca46dde6bb1 --- a/deepspeed/runtime/eigenvalue.py +++ b/deepspeed/runtime/eigenvalue.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch from deepspeed.utils import log_dist import numpy as np diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 1567bd57b00e86849c0baa31f4907c360b87fac0..08cb1fd7276a1dc288881522769020e15f05096a 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -5,12 +5,9 @@ Copyright 2019 The Microsoft DeepSpeed Team import os import re import stat -import math import torch -import warnings import hashlib -import torch.distributed as dist -from collections import defaultdict, OrderedDict +from collections import defaultdict, OrderedDict, deque from shutil import copyfile from torch.nn.modules import Module @@ -18,17 +15,18 @@ from torch.nn.parameter import Parameter from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler -from typing import Callable, Dict, Optional, Union, Iterable +from typing import Callable, Dict, Union, Iterable import deepspeed -from deepspeed.runtime.utils import see_memory_usage, get_ma_status, DummyOptim +from deepspeed.runtime.utils import see_memory_usage, DummyOptim +from .zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer 
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from deepspeed.runtime.zero.utils import is_zero_supported_optimizer, ZeRORuntimeException -from deepspeed.runtime.activation_checkpointing import ( - checkpointing as activation_checkpointing, -) +from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload +from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION + from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.bf16_optimizer import BF16_Optimizer @@ -40,27 +38,48 @@ from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \ from deepspeed.runtime.dataloader import DeepSpeedDataLoader from deepspeed.runtime.constants import \ ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ - PLD_THETA, PLD_GAMMA, BFLOAT16, FP16 -from deepspeed.runtime.zero.constants import \ - ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS, ZERO_OPTIMIZATION_WEIGHTS + PLD_THETA, PLD_GAMMA, BFLOAT16, FP16, AMP, GRADIENT_ACCUMULATION_STEPS, \ + DATA_PARALLEL_GROUP, GLOBAL_RANK +from deepspeed.runtime.zero.config import ZeroStageEnum +from deepspeed.compression import compression_scheduler +from deepspeed.compression.constants import \ + WEIGHT_QUANTIZE_IN_FORWARD_ENABLED, \ + WEIGHT_QUANTIZATION, SHARED_PARAMETERS, \ + WEIGHT_QUANTIZE_ENABLED, \ + WEIGHT_QUANTIZE_GROUPS, \ + WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE, \ + WEIGHT_QUANTIZE_CHANGE_RATIO, \ + WEIGHT_QUANTIZE_TYPE, \ + WEIGHT_QUANTIZE_ROUNDING, \ + WEIGHT_QUANTIZE_VERBOSE, \ + WEIGHT_QUANTIZE_KERNEL from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT from deepspeed.runtime.sparse_tensor import SparseTensor -import deepspeed.runtime.lr_schedules as lr_schedules -import deepspeed.utils.groups as groups -from deepspeed.runtime.utils import get_grad_norm -from deepspeed.utils import logger, log_dist, init_distributed, instrument_w_nvtx +from 
deepspeed.runtime import lr_schedules +from deepspeed.utils import groups +from deepspeed.utils import logger, log_dist, instrument_w_nvtx from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer from deepspeed.utils.debug import debug_extract_module_and_param_names +from deepspeed.monitor.monitor import MonitorMaster from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop from deepspeed.runtime.utils import clip_grad_norm_ from deepspeed.runtime.eigenvalue import Eigenvalue +from deepspeed.runtime.data_pipeline.constants import DATA_SAMPLING, \ + DATA_ROUTING, DATA_SAMPLING_ENABLED, CURRICULUM_LEARNING, \ + CURRICULUM_LEARNING_ENABLED, DATA_SAMPLING_NUM_WORKERS, RANDOM_LTD, \ + RANDOM_LTD_ENABLED, RANDOM_LTD_LAYER_ID, RANDOM_LTD_LAYER_NUM, \ + RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE, RANDOM_LTD_LAYER_TOKEN_LR_ENABLED, \ + RANDOM_LTD_GLOBAL_BATCH_SIZE, RANDOM_LTD_MICRO_BATCH_SIZE, DATA_EFFICIENCY from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler +from deepspeed.runtime.data_pipeline.data_routing.scheduler import RandomLTDScheduler +from deepspeed.runtime.data_pipeline.data_routing.helper import remove_random_ltd_state_dict +from deepspeed.runtime.data_pipeline.data_routing.basic_layer import RandomLayerTokenDrop + +from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine from .pipe.module import PipelineModule from .utils import ensure_directory_exists, get_ma_status -from ..ops.op_builder import UtilsBuilder -from ..ops.adam import DeepSpeedCPUAdam from ..ops.adam import FusedAdam from ..moe.sharded_moe import TopKGate, MOELayer from ..moe.layer import MoE @@ -68,7 +87,15 @@ from ..moe.utils import is_moe_param from ..git_version_info import version from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler -from deepspeed.utils.logging import print_json_dist +from deepspeed.utils.logging import print_json_dist, print_configuration + +from 
deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import UtilsBuilder + +from deepspeed.inference.config import DtypeEnum + +# Set to torch's distributed package or deepspeed.comm based inside DeepSpeedEngine init +dist = None MEMORY_OPT_ALLREDUCE_SIZE = 500000000 @@ -87,11 +114,12 @@ except ImportError: def split_half_float_double_sparse(tensors): + device_type = get_accelerator().device_name() supported_types = [ - "torch.cuda.HalfTensor", - "torch.cuda.FloatTensor", - "torch.cuda.DoubleTensor", - "torch.cuda.BFloat16Tensor", + "torch.{}.HalfTensor".format(device_type), + "torch.{}.FloatTensor".format(device_type), + "torch.{}.DoubleTensor".format(device_type), + "torch.{}.BFloat16Tensor".format(device_type), SparseTensor.type() ] @@ -106,13 +134,6 @@ def split_half_float_double_sparse(tensors): return buckets -def print_configuration(args, name): - logger.info("{}:".format(name)) - for arg in sorted(vars(args)): - dots = "." * (29 - len(arg)) - logger.info(" {} {} {}".format(arg, dots, getattr(args, arg))) - - FORWARD_MICRO_TIMER = 'forward_microstep' FORWARD_GLOBAL_TIMER = 'forward' BACKWARD_MICRO_TIMER = 'backward_microstep' @@ -185,7 +206,6 @@ class DeepSpeedEngine(Module): super(DeepSpeedEngine, self).__init__() self.dont_change_device = dont_change_device self.client_optimizer = optimizer - self.client_model_parameters = model_parameters self.client_lr_scheduler = lr_scheduler self.training_data = training_data self.collate_fn = collate_fn @@ -205,14 +225,21 @@ class DeepSpeedEngine(Module): self.eigenvalue = None self.block_eigenvalue = None self.gas_boundary_ctr = 0 - self.dist_backend = "nccl" + self.dist_backend = get_accelerator().communication_backend_name() self.has_moe_layers = False self.num_experts = [] self.gate_modules = [] self.moe_layers = [] self._step_applied = False self._global_grad_norm = None + self.use_ds_comm = False # False --> Use torch.dist, True --> Use ds.comm backend. 
+ + self.checkpoint_engine = None + + global dist + from deepspeed import comm as dist self._is_gradient_accumulation_boundary = None + self.scale_wrt_gas = None # for debug purposes - can then debug print: debug_get_module_name(module) debug_extract_module_and_param_names(model) @@ -224,16 +251,22 @@ class DeepSpeedEngine(Module): if self.config is None and config_params is not None: self.config = config_params - if dist_init_required is None: - dist_init_required = not dist.is_initialized() - - if dist_init_required is False: - assert ( - dist.is_initialized() is True - ), "Torch distributed not initialized. Please set dist_init_required to True or initialize before calling deepspeed.initialize()" + from deepspeed.comm import supported_torch_version + # This supported_torch_version check is for torch1.2 compatibility only + if supported_torch_version: + dist.init_distributed(dist_backend=self.dist_backend, + dist_init_required=dist_init_required) else: - # Initialize torch distributed if needed - init_distributed(dist_backend=self.dist_backend) + if dist_init_required is None: + dist_init_required = not dist.is_initialized() + + if dist_init_required is False: + assert ( + dist.is_initialized() is True + ), "Torch distributed not initialized. Please set dist_init_required to True or initialize before calling deepspeed.initialize()" + else: + if not dist.is_initialized(): + dist.init_process_group(backend=self.dist_backend) self._do_args_sanity_check(args) self._configure_with_arguments(args, mpu) @@ -241,14 +274,17 @@ class DeepSpeedEngine(Module): see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown()) if mpu is not None: - assert not self.elasticity_enabled(), ( - "Elasticity is not currently supported" " with model parallelism." 
- ) + if self.elasticity_enabled(): + if not self.is_elastic_model_parallel_supported(): + assert not self.elasticity_enabled(), ( + "Elasticity is not currently supported" " with model parallelism." + ) self._set_distributed_vars(args) - if self.tensorboard_enabled() and self.global_rank == 0: - self.summary_writer = self.get_summary_writer() + dist.configure(self._config) + + self.monitor = MonitorMaster(self._config.monitor_config) see_memory_usage( f"DeepSpeed Engine: Before configure distributed model", @@ -268,15 +304,13 @@ class DeepSpeedEngine(Module): self.timers = SynchronizedWallClockTimer() # Throughput timer self.tput_timer = ThroughputTimer( - batch_size=self.train_micro_batch_size_per_gpu(), - num_workers=self.dp_world_size, + batch_size=self.train_batch_size(), steps_per_output=self.steps_per_print(), monitor_memory=False, ) - if dist.get_rank() == 0: - logger.info( - f"DeepSpeed Flops Profiler Enabled: {self.flops_profiler_enabled()}") + log_dist(f"DeepSpeed Flops Profiler Enabled: {self.flops_profiler_enabled()}", + ranks=[0]) if self.flops_profiler_enabled(): self.flops_profiler = FlopsProfiler(self.module, self) @@ -290,15 +324,23 @@ class DeepSpeedEngine(Module): self.optimizer = None self.basic_optimizer = None self.lr_scheduler = None - if model_parameters or optimizer: + has_optimizer = False + + if optimizer or self.optimizer_name(): + has_optimizer = True + # If no parameters given by init default to module parameters + if model_parameters is None: + model_parameters = self.module.parameters() + + if has_optimizer: self._configure_optimizer(optimizer, model_parameters) self._configure_lr_scheduler(lr_scheduler) self._report_progress(0) elif self.zero_optimization(): # no optim selected but zero is enabled self.optimizer = self._configure_zero_optimizer(optimizer=None) - - self._get_model_parameters() + elif self.bfloat16_enabled(): + self.optimizer = self._configure_bf16_optimizer(optimizer=None) # Bookkeeping for sparse support 
self.sparse_tensor_module_names = set() @@ -313,7 +355,8 @@ class DeepSpeedEngine(Module): self.save_non_zero_checkpoint = False self.save_zero_checkpoint = False - self._configure_checkpointing(dist_init_required) + if not isinstance(self.optimizer, DeepSpeedZeRoOffload): + self._configure_checkpointing(dist_init_required) if self.eigenvalue_enabled(): self.eigenvalue = self._configure_eigenvalue() @@ -321,8 +364,17 @@ class DeepSpeedEngine(Module): if self.pld_enabled(): self.progressive_layer_drop = self._configure_progressive_layer_drop() - if self.curriculum_enabled(): - self.curriculum_scheduler = self._configure_curriculum_scheduler() + if self.curriculum_enabled_legacy(): + self.curriculum_scheduler_legacy = self._configure_curriculum_scheduler_legacy( + ) + + if self.random_ltd_enabled(): + random_ltd_config = self.random_ltd_config() + random_ltd_config[RANDOM_LTD_GLOBAL_BATCH_SIZE] = self.train_batch_size() + random_ltd_config[ + RANDOM_LTD_MICRO_BATCH_SIZE] = self.train_micro_batch_size_per_gpu() + self.random_ltd_scheduler = self._configure_random_ltd_scheduler( + random_ltd_config) # Engine timers @@ -341,6 +393,10 @@ class DeepSpeedEngine(Module): self.flatten = util_ops.flatten self.unflatten = util_ops.unflatten + def destroy(self): + if self.optimizer is not None and hasattr(self.optimizer, 'destroy'): + self.optimizer.destroy() + def _get_model_parameters(self): if self.autotuning_profile_model_info(): self.autotuning_model_info = {} @@ -367,7 +423,6 @@ class DeepSpeedEngine(Module): def get_batch_info(self): """Get all training batch related settings. - Returns: train_batch_size (int): The effective training batch size. This is the amount of data samples that leads to one step of model update. 
@@ -403,10 +458,18 @@ class DeepSpeedEngine(Module): self._config.train_batch_size = train_batch_size self._config.gradient_accumulation_steps = new_gas + def set_data_post_process_func(self, post_process_func): + if self.training_dataloader is not None: + self.training_dataloader.post_process_func = post_process_func + + def set_custom_curriculum_learning_schedule(self, schedule_func_dict): + if self.training_dataloader is not None and self.curriculum_learning_enabled(): + self.training_dataloader.data_sampler.set_custom_curriculum_learning_schedule( + schedule_func_dict) + def get_global_grad_norm(self) -> float: """Return the 2-norm of all gradients. If there is model parallelism, the norm will be global. - The computed norm will be cached and reused until the next step() pass. .. note:: In the presence of model parallelism, this is a collective call @@ -416,6 +479,22 @@ class DeepSpeedEngine(Module): """ return self._global_grad_norm + def __getattr__(self, name): + """ + Pass through attributes defined in the model if they are not overridden by ds-engine. 
+ """ + + _module = {} + if "module" in self.__dict__: + _module = self.__dict__['module'] + if name in dir(self): + return getattr(self, name) + elif name in dir(_module): + return getattr(_module, name) + else: + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{name}'") + def checkpoint_tag_validation_enabled(self): return self._config.checkpoint_tag_validation_enabled @@ -425,6 +504,14 @@ class DeepSpeedEngine(Module): def elasticity_enabled(self): return self._config.elasticity_enabled + def is_elastic_model_parallel_supported(self): + if self.elasticity_enabled(): + # Add code for finding number of GPUs per node automatically + if self._config.num_gpus_per_node % self._config.elastic_model_parallel_size == 0: + return True + else: + return False + def pld_enabled(self): return self._config.pld_enabled @@ -461,59 +548,64 @@ class DeepSpeedEngine(Module): def eigenvalue_layer_num(self): return self._config.eigenvalue_layer_num - def curriculum_enabled(self): - return self._config.curriculum_enabled + def curriculum_enabled_legacy(self): + return self._config.curriculum_enabled_legacy - def curriculum_params(self): - return self._config.curriculum_params + def curriculum_params_legacy(self): + return self._config.curriculum_params_legacy - def tensorboard_enabled(self): - return self._config.tensorboard_enabled + def data_efficiency_enabled(self): + return self._config.data_efficiency_enabled - def tensorboard_output_path(self): - return self._config.tensorboard_output_path + def data_efficiency_config(self): + return self._config.data_efficiency_config - def tensorboard_job_name(self): - return self._config.tensorboard_job_name + def data_sampling_enabled(self): + return self._config.data_efficiency_config[DATA_SAMPLING][DATA_SAMPLING_ENABLED] - def get_summary_writer( - self, - name="DeepSpeedJobName", - base=os.path.join(os.path.expanduser("~"), - "tensorboard"), - ): - if self.tensorboard_output_path(): - base_dir = 
self.tensorboard_output_path() - job_name = self.tensorboard_job_name() - log_dir = os.path.join(base_dir, job_name) - else: - if self.tensorboard_job_name(): - name = self.tensorboard_job_name() - - # Infrastructure-specific job-id - if "DLWS_JOB_ID" in os.environ: - infra_job_id = os.environ["DLWS_JOB_ID"] - elif "DLTS_JOB_ID" in os.environ: - infra_job_id = os.environ["DLTS_JOB_ID"] - else: - infra_job_id = "unknown-job-id" + def data_sampling_config(self): + return self._config.data_efficiency_config[DATA_SAMPLING] - summary_writer_dir_name = os.path.join(infra_job_id, "logs") - log_dir = os.path.join(base, summary_writer_dir_name, name) + def curriculum_learning_enabled(self): + return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_ENABLED] - os.makedirs(log_dir, exist_ok=True) - try: - # torch.utils.tensorboard will fail if `tensorboard` is not available, - # see their docs for more details: https://pytorch.org/docs/1.8.0/tensorboard.html - import tensorboard - except ImportError: - print( - 'If you want to use tensorboard logging please `pip install tensorboard`' - ) - raise - from torch.utils.tensorboard import SummaryWriter + def curriculum_learning_config(self): + return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING] + + def random_ltd_enabled(self): + return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD][ + RANDOM_LTD_ENABLED] - return SummaryWriter(log_dir=log_dir) + def random_ltd_config(self): + return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD] + + def random_ltd_initialize(self): + assert self.random_ltd_enabled() + random_ltd_config = self.random_ltd_config() + random_ltd_queue = deque( + [x for x in sorted(random_ltd_config[RANDOM_LTD_LAYER_ID])]) + count = 0 + for name, layer in self.module.named_modules(): + if isinstance(layer, RandomLayerTokenDrop): + if len(random_ltd_queue) != 0 and str( + random_ltd_queue[0]) in name: ###[1,2,3] + 
layer.init_config(random_ltd_config, + self.random_ltd_scheduler, + count) + random_ltd_queue.popleft() + count += 1 + + if random_ltd_config[RANDOM_LTD_LAYER_NUM] != count: + raise ValueError( + f'random_ltd_layer_num {random_ltd_config[RANDOM_LTD_LAYER_NUM]} must be \ + equivalent to the len of random_ltd_layer_id {count}') + + if random_ltd_config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][ + RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]: + assert self.client_lr_scheduler is None + raise ValueError(f'not yet support') + #self.lr_scheduler = lr_schedules.WarmupLayerTokenDecayLR(self.optimizer, self.random_ltd_scheduler) def wall_clock_breakdown(self): return self._config.wall_clock_breakdown @@ -601,18 +693,24 @@ class DeepSpeedEngine(Module): def quantize_training(self): return ( - self._config.quantize_training_enabled, - self._config.quantize_target_bits, - self._config.quantize_start_bits, - self._config.quantize_period, - self._config.quantize_offset, - self._config.quantize_groups, - self._config.fp16_mixed_quantize, - self._config.quantize_change_rate, - self._config.quantize_type, - self._config.quantize_rounding, - self._config.quantize_verbose, - self._config.use_quantizer_kernel, + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] + [WEIGHT_QUANTIZE_IN_FORWARD_ENABLED], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] + [WEIGHT_QUANTIZE_ENABLED], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] + [WEIGHT_QUANTIZE_GROUPS], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] + [WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] + [WEIGHT_QUANTIZE_CHANGE_RATIO], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] + [WEIGHT_QUANTIZE_TYPE], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] + [WEIGHT_QUANTIZE_ROUNDING], + 
self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] + [WEIGHT_QUANTIZE_VERBOSE], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] + [WEIGHT_QUANTIZE_KERNEL], ) def zero_optimization(self): @@ -633,8 +731,18 @@ class DeepSpeedEngine(Module): def zero_offload_param(self): return self._config.zero_config.offload_param + def zero_use_cpu_optimizer(self): + if self._config.zero_config.offload_optimizer is not None: + return self._config.zero_config.offload_optimizer.device in [ + OffloadDeviceEnum.cpu, + OffloadDeviceEnum.nvme + ] + return False + def zero_cpu_offload(self): - return self._config.zero_config.offload_optimizer is not None + if self._config.zero_config.offload_optimizer is not None: + return self._config.zero_config.offload_optimizer.device == OffloadDeviceEnum.cpu + return False def zero_sub_group_size(self): return self._config.zero_config.sub_group_size @@ -649,10 +757,10 @@ class DeepSpeedEngine(Module): return self._config.zero_config.allgather_bucket_size def zero_optimization_partition_gradients(self): - return self.zero_optimization_stage() >= ZERO_OPTIMIZATION_GRADIENTS + return self.zero_optimization_stage() >= ZeroStageEnum.gradients def zero_optimization_partition_weights(self): - return self.zero_optimization_stage() >= ZERO_OPTIMIZATION_WEIGHTS + return self.zero_optimization_stage() >= ZeroStageEnum.weights def zero_contiguous_gradients(self): return self._config.zero_config.contiguous_gradients @@ -675,6 +783,9 @@ class DeepSpeedEngine(Module): def zero_param_persistence_threshold(self): return self._config.zero_config.param_persistence_threshold + def zero_model_persistence_threshold(self): + return self._config.zero_config.model_persistence_threshold + def zero_gather_16bit_weights_on_model_save(self): return self._config.zero_config.gather_16bit_weights_on_model_save @@ -702,12 +813,21 @@ class DeepSpeedEngine(Module): def amp_params(self): return self._config.amp_params + def 
fp16_auto_cast(self): + return self._config.fp16_auto_cast + def loss_scale(self): return self._config.loss_scale def gradient_accumulation_steps(self): return self._config.gradient_accumulation_steps + def use_node_local_storage(self): + return self._config.use_node_local_storage + + def load_universal_checkpoint(self): + return self._config.load_universal_checkpoint + @property def communication_data_type(self): res = self._config.communication_data_type @@ -756,39 +876,71 @@ class DeepSpeedEngine(Module): def aio_config(self): return self._config.aio_config + def get_data_types(self): + model_dtype = torch.float32 + if self.fp16_enabled(): + model_dtype = torch.float16 + elif self.bfloat16_enabled(): + model_dtype = torch.bfloat16 + + if self._config.grad_accum_dtype == None: + if model_dtype == torch.bfloat16 and not self.zero_optimization(): + grad_accum_dtype = torch.float32 + else: + grad_accum_dtype = model_dtype + else: + grad_accum_dtype = DtypeEnum(self._config.grad_accum_dtype).value + + return (model_dtype, grad_accum_dtype) + def _configure_lr_scheduler(self, client_lr_scheduler): # First check for scheduler in json configuration lr_scheduler = self._scheduler_from_config(self.optimizer) if lr_scheduler: - if self.global_rank == 0: - logger.info( - f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}") + log_dist( + f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}", + ranks=[0]) self.lr_scheduler = lr_scheduler else: if isinstance(client_lr_scheduler, Callable): - if self.global_rank == 0: - logger.info('DeepSpeed using client callable to create LR scheduler') + log_dist('DeepSpeed using client callable to create LR scheduler', + ranks=[0]) self.lr_scheduler = client_lr_scheduler(self.basic_optimizer) else: - if self.global_rank == 0: - logger.info('DeepSpeed using client LR scheduler') + log_dist('DeepSpeed using client LR scheduler', ranks=[0]) self.lr_scheduler = client_lr_scheduler log_dist(f'DeepSpeed LR 
Scheduler = {self.lr_scheduler}', ranks=[0]) def _configure_checkpointing(self, dist_init_required): + self.checkpoint_engine = TorchCheckpointEngine() + + if self._config is not None and self._config.nebula_config.enabled: + try: + from deepspeed.runtime.checkpoint_engine.nebula_checkpoint_engine import \ + NebulaCheckpointEngine + self.checkpoint_engine = NebulaCheckpointEngine( + config_params=self._config.nebula_config) + except ImportError as err: + logger.error( + f"No torch_nebula was found! Will fall back to torch.save. Details: {err}" + ) + self.checkpoint_engine = TorchCheckpointEngine() dp_rank = self.global_rank if self.mpu: dp_rank = self.mpu.get_data_parallel_rank() + rank = self.local_rank if self.use_node_local_storage() else dp_rank + # only the first data parallel process needs to store the model checkpoint + # if you want to use node local storage this must be done by rank 0 on each + # node self.save_non_zero_checkpoint = ( - dp_rank == 0) or self.zero_optimization_partition_weights() + rank == 0) or self.zero_optimization_partition_weights() if self.zero_optimization() or self.bfloat16_enabled(): - param_rank = torch.distributed.get_rank( - group=self.optimizer.dp_process_group) + param_rank = dist.get_rank(group=self.optimizer.dp_process_group) # Only the first parameter parallel process needs to store the # optimizer state checkpoints for zero @@ -817,14 +969,14 @@ class DeepSpeedEngine(Module): args, 'device_rank') else self.local_rank if device_rank >= 0: - torch.cuda.set_device(device_rank) - self.device = torch.device("cuda", device_rank) + get_accelerator().set_device(device_rank) + self.device = torch.device(get_accelerator().device_name(), device_rank) self.world_size = dist.get_world_size() self.global_rank = dist.get_rank() else: self.world_size = 1 self.global_rank = 0 - self.device = torch.device("cuda") + self.device = torch.device(get_accelerator().device_name()) # Configure based on command line arguments def 
_configure_with_arguments(self, args, mpu): @@ -863,7 +1015,7 @@ class DeepSpeedEngine(Module): args.deepspeed_config = args.deepscale_config assert "LOCAL_RANK" in os.environ or "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ, "DeepSpeed requires the LOCAL_RANK environment " \ - "variable, it is set by the deepspeed launcher, deepspeed.init_distributed, or the torch.distributed launcher. If using a " \ + "variable, it is set by the deepspeed launcher, deepspeed.init_distributed, or the torch's launcher. If using a " \ "different launcher please ensure LOCAL_RANK is set prior to initializing deepspeed." if hasattr(args, 'local_rank') and args.local_rank != None: @@ -881,21 +1033,30 @@ class DeepSpeedEngine(Module): args, "deepspeed_config") and args.deepspeed_config is not None ), "DeepSpeed requires --deepspeed_config to specify configuration file" - assert os.path.isfile( - args.deepspeed_config - ), "DeepSpeed configuration file: {} is not an existing file".format( - args.deepspeed_config - ) - def _is_supported_optimizer(self, optimizer_name): return (optimizer_name in DEEPSPEED_OPTIMIZERS or getattr(torch.optim, optimizer_name, None) is not None) + def _supported_optims(self): + FairseqOptimizer = None + try: + from fairseq.optim.fairseq_optimizer import FairseqOptimizer + except ImportError: + pass + + expected_optim_types = [Optimizer] + if FairseqOptimizer: + # fairseq optims are not torch.optim objects + expected_optim_types.append(FairseqOptimizer) + return expected_optim_types + # Validate configuration based on command line arguments def _do_sanity_check(self): - assert isinstance(self.client_optimizer, (type(None), Optimizer, Callable)), \ + expected_optim_types = self._supported_optims() + expected_optim_types += [type(None), Callable] + assert isinstance(self.client_optimizer, tuple(expected_optim_types)), \ f'Client Optimizer is of unexpected type {type(self.client_optimizer)}' if not self.client_optimizer: @@ -940,6 +1101,7 @@ class 
DeepSpeedEngine(Module): @staticmethod def __check_params(model: Module, dtype: torch.dtype) -> None: + return if not all(param.dtype == dtype for param in model.parameters()) and dist.get_rank() == 0: raise ValueError( @@ -948,31 +1110,27 @@ class DeepSpeedEngine(Module): f"{[(n, p.dtype) for n, p in model.named_parameters() if p.dtype != dtype]}" ) + def _set_client_model(self, model): + # register client model in _modules so that nn.module methods work correctly + modules = self.__dict__.get('_modules') + modules['module'] = model + # register module attribute in engine but avoid getattr + self.__dict__['module'] = model + def _configure_distributed_model(self, model): - self.module = model + self._set_client_model(model) + if self.fp16_enabled(): if self.zero_optimization_partition_weights() and any( [hasattr(param, "ds_id") for param in self.module.parameters()]): - if not all( - [param.dtype == torch.half for param in self.module.parameters()]): - names = [ - n for n, - p in self.module.named_parameters() if p.dtype != torch.half - ] - raise ValueError( - f"fp16 is enabled but the following parameters have dtype that is not fp16: {', '.join(names)}" - ) + self.__check_params(self.module, torch.half) self.module.half() elif self.bfloat16_enabled(): if self.zero_optimization_partition_weights() and any( hasattr(param, 'ds_id') for param in self.module.parameters()): self.__check_params(self.module, torch.bfloat16) - if self.zero_optimization_stage() == 0 and not self.pipeline_parallelism: - raise NotImplementedError( - "When not running ZeRO, BF16 training support is only supported for Pipeline parallelism" - ) self.module.bfloat16() else: self.__check_params(self.module, torch.float) @@ -1031,78 +1189,119 @@ class DeepSpeedEngine(Module): ]) assert occurrence <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour." 
+ def _do_optimizer_sanity_check(self, basic_optimizer): + model_dtype, grad_accum_dtype = self.get_data_types() + zero_enabled = self.zero_optimization() + amp_enabled = self.amp_enabled() + # config based assertions + assert ( + not (amp_enabled and zero_enabled) + ), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similar to amp opt_mode=O2" + if zero_enabled: + if not is_zero_supported_optimizer(basic_optimizer): + assert ( + self.zero_allow_untested_optimizer() + ), 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' + + if self.global_rank == 0: + logger.warning( + "**** You are using ZeRO with an untested optimizer, proceed with caution *****" + ) + + if model_dtype == torch.bfloat16 and grad_accum_dtype == torch.float32 and self.zero_optimization_stage( + ) == 1: + return BFLOAT16 + + if model_dtype != grad_accum_dtype: + raise NotImplementedError( + "Model data type and gradient accumulation data type must be equal to use ZeRO" + ) + return ZERO_OPTIMIZATION + elif amp_enabled: + if model_dtype != grad_accum_dtype: + raise NotImplementedError( + "Model data type and gradient accumulation data type must be equal to use Amp" + ) + if model_dtype == torch.bfloat16 or model_dtype == torch.float16: + raise NotImplementedError( + "Cannot enable both amp with (legacy) fp16 or bfloat16 mode") + try: + logger.info("Initializing Apex amp from: {}".format(amp.__path__)) + except NameError: + # If apex/amp is available it will be imported above + raise RuntimeError( + "Unable to import apex/amp, please make sure it is installed") + return AMP + # data type checks + elif model_dtype == grad_accum_dtype: + if model_dtype == torch.bfloat16: + raise NotImplementedError( + "Bfloat16 wrapper must use a gradient accumulation type of fp32, enable ZeRO to use Bfloat16 gradient accumulation" + ) + if model_dtype == torch.float16: + return FP16 + # else 
optimizer_wrapper = None + elif model_dtype == torch.bfloat16 and grad_accum_dtype == torch.float32: + return BFLOAT16 + else: + raise NotImplementedError( + "unsupported mix of model dtype and gradient accummulation type") + + return None + # Configure optimizer def _configure_optimizer(self, client_optimizer, model_parameters): if client_optimizer is not None: - if isinstance(client_optimizer, Optimizer): + if isinstance(client_optimizer, tuple(self._supported_optims())): client_optimizer.param_groups[:] = [ pg for pg in client_optimizer.param_groups if len(pg["params"]) != 0 ] - if self.global_rank == 0: - logger.info( - "Removing param_group that has no 'params' in the client Optimizer" - ) + log_dist( + "Removing param_group that has no 'params' in the client Optimizer", + ranks=[0]) basic_optimizer = client_optimizer - if self.global_rank == 0: - logger.info('Using client Optimizer as basic optimizer') + log_dist('Using client Optimizer as basic optimizer', ranks=[0]) else: basic_optimizer = client_optimizer(model_parameters) - if self.global_rank == 0: - logger.info('Using client callable to create basic optimizer') + log_dist('Using client callable to create basic optimizer', ranks=[0]) else: basic_optimizer = self._configure_basic_optimizer(model_parameters) - if self.global_rank == 0: - logger.info( - "Using DeepSpeed Optimizer param name {} as basic optimizer".format( - self.optimizer_name())) + log_dist( + f"Using DeepSpeed Optimizer param name {self.optimizer_name()} as basic optimizer", + ranks=[0]) self._check_for_duplicates(basic_optimizer) self.basic_optimizer = basic_optimizer - if self.global_rank == 0: - logger.info("DeepSpeed Basic Optimizer = {}".format( - basic_optimizer.__class__.__name__)) + log_dist("DeepSpeed Basic Optimizer = {}".format( + basic_optimizer.__class__.__name__), + ranks=[0]) - if self.zero_optimization(): - assert ( - not self.amp_enabled() - ), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which 
performs similar to amp opt_mode=O2" - if not is_zero_supported_optimizer(basic_optimizer): - assert ( - self.zero_allow_untested_optimizer() - ), 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' + optimizer_wrapper = self._do_optimizer_sanity_check(basic_optimizer) - if self.global_rank == 0: - logger.warning( - "**** You are using ZeRO with an untested optimizer, proceed with caution *****" - ) + if optimizer_wrapper == ZERO_OPTIMIZATION: self.optimizer = self._configure_zero_optimizer(basic_optimizer) - elif self.amp_enabled(): - assert not (self.fp16_enabled() or self.bfloat16_enabled()), "Cannot enable both amp with (legacy) fp16 or bfloat16 mode" + elif optimizer_wrapper == AMP: amp_params = self.amp_params() - if self.global_rank == 0: - logger.info(f"Initializing AMP with these params: {amp_params}") - try: - logger.info("Initializing Apex amp from: {}".format(amp.__path__)) - except NameError: - # If apex/amp is available it will be imported above - raise RuntimeError( - "Unable to import apex/amp, please make sure it is installed") - self.module, self.optimizer = amp.initialize( + log_dist(f"Initializing AMP with these params: {amp_params}", ranks=[0]) + model, self.optimizer = amp.initialize( self.module, basic_optimizer, **amp_params ) + self._set_client_model(model) self._broadcast_model() # TODO: maybe need to broadcast experts differently? 
- elif self.fp16_enabled(): + elif optimizer_wrapper == FP16: self.optimizer = self._configure_fp16_optimizer(basic_optimizer) - elif self.bfloat16_enabled(): + elif optimizer_wrapper == BFLOAT16: self.optimizer = self._configure_bf16_optimizer(basic_optimizer) else: self.optimizer = basic_optimizer + log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer_name()), ranks=[0]) + self.compression_scheduler = self._configure_compression_scheduler() self.quantizer = self._configure_quantization() def _configure_basic_optimizer(self, model_parameters): @@ -1131,7 +1330,7 @@ class DeepSpeedEngine(Module): optimizer = torch.optim.AdamW(model_parameters, **optimizer_parameters) else: - if self.zero_cpu_offload(): + if self.zero_use_cpu_optimizer(): if self.optimizer_name() == ADAGRAD_OPTIMIZER: from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad optimizer = DeepSpeedCPUAdagrad(model_parameters, @@ -1185,13 +1384,16 @@ class DeepSpeedEngine(Module): optimizer = torch_optimizer(model_parameters, **optimizer_parameters) return optimizer + def _configure_compression_scheduler(self): + return compression_scheduler(self.module, self._config.compression_config) + + def _configure_random_ltd_scheduler(self, configs): + return RandomLTDScheduler(configs) + def _configure_quantization(self): ( + quantize_weight_in_forward, quantize_enabled, - q_target_bits, - q_start_bits, - q_period, - q_offset, q_groups, q_mixed_fp16, q_change_ratio, @@ -1200,15 +1402,13 @@ class DeepSpeedEngine(Module): q_verbose, use_quantizer_kernel, ) = self.quantize_training() + if quantize_enabled and not quantize_weight_in_forward: + assert self.fp16_enabled(), "MoQ (quantize in optimization step) weight quantization is only supported for FP16" quantizer = None - if quantize_enabled: + if quantize_enabled and not quantize_weight_in_forward: from deepspeed.runtime.quantize import Quantizer quantizer = Quantizer( - q_target_bits, - q_start_bits, - q_period, - q_offset, q_groups, q_mixed_fp16, 
q_change_ratio, @@ -1232,7 +1432,7 @@ class DeepSpeedEngine(Module): if isinstance(optimizer, fused_opts) \ or self.optimizer_name() in [ONEBIT_ADAM_OPTIMIZER, ZERO_ONE_ADAM_OPTIMIZER]: if self.dynamic_loss_scale(): - log_dist("Creating fp16 optimizer with dynamic loss scale", ranks=[0]) + log_dist(f'Creating fp16 optimizer with dynamic loss scale', ranks=[0]) timers = self.timers if self.wall_clock_breakdown() else None optimizer = FP16_Optimizer( optimizer, @@ -1248,10 +1448,8 @@ class DeepSpeedEngine(Module): ) else: log_dist( - "Creating fp16 optimizer with static loss scale: {}".format( - self.loss_scale()), - ranks=[0], - ) + f'Creating fp16 optimizer with static loss scale: {self.loss_scale()}', + ranks=[0]) optimizer = FP16_Optimizer( optimizer, deepspeed=self, @@ -1262,7 +1460,7 @@ class DeepSpeedEngine(Module): has_moe_layers=self.has_moe_layers, ) else: - log_dist("Creating fp16 unfused optimizer with dynamic loss scale", + log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0]) optimizer = FP16_UnfusedOptimizer( optimizer, @@ -1280,11 +1478,15 @@ class DeepSpeedEngine(Module): def _configure_bf16_optimizer(self, optimizer): clip_grad = self.gradient_clipping() - if self.global_rank == 0: - logger.info('Creating unfused BF16 optimizer') + if optimizer is None: + optimizer = DummyOptim(list(self.module.parameters())) + + log_dist('Creating BF16 optimizer', ranks=[0]) + timers = self.timers if self.wall_clock_breakdown() else None optimizer = BF16_Optimizer( optimizer, + self.param_names, mpu=self.mpu, clip_grad=clip_grad, allgather_bucket_size=self.zero_allgather_bucket_size(), @@ -1295,8 +1497,7 @@ class DeepSpeedEngine(Module): def _configure_zero_optimizer(self, optimizer): zero_stage = self.zero_optimization_stage() - log_dist('Creating fp16 ZeRO stage {} optimizer'.format(zero_stage), ranks=[0]) - assert self.communication_data_type in (torch.float16, torch.bfloat16), "ZeRO supports only 'communication_data_type': ['fp16', 
'bfp16']" + model_dtype, grad_accum_dtype = self.get_data_types() timers = self.timers if self.wall_clock_breakdown() else None if optimizer is None: @@ -1307,17 +1508,21 @@ class DeepSpeedEngine(Module): "The deprecated version of ZeRO Stage 1 is not supported in deepspeed >= 0.5.9. Please downgrade to a version less than 0.5.9 if you need to use this deprecated version of ZeRO." ) - if zero_stage <= ZERO_OPTIMIZATION_GRADIENTS: + if zero_stage <= ZeroStageEnum.gradients: overlap_comm = self.zero_overlap_comm() contiguous_gradients = self.zero_contiguous_gradients() round_robin_gradients = self.zero_round_robin_gradients() - assert not isinstance(optimizer, DummyOptim), "zero stage 2 requires an optimizer" + assert not isinstance(optimizer, DummyOptim), "zero stage {} requires an optimizer".format(zero_stage) + log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', + ranks=[0]) # Overlap and contiguous grads are meaningless in stage 1 and are ignored - if zero_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES: + if zero_stage == ZeroStageEnum.optimizer_states: overlap_comm = False - contiguous_gradients = False round_robin_gradients = False + # Non-MoE requires contiguous grads to be disabled w. stage 1 + if not self.has_moe_layers: + contiguous_gradients = False if isinstance(self.module, PipelineModule): if overlap_comm: @@ -1325,9 +1530,9 @@ class DeepSpeedEngine(Module): "Pipeline parallelism does not support overlapped communication, will be disabled." 
) overlap_comm = False - optimizer = DeepSpeedZeroOptimizer( optimizer, + self.param_names, timers=timers, static_loss_scale=self.loss_scale(), dynamic_loss_scale=self.dynamic_loss_scale(), @@ -1349,7 +1554,7 @@ class DeepSpeedEngine(Module): gradient_predivide_factor=self.gradient_predivide_factor(), gradient_accumulation_steps=self.gradient_accumulation_steps(), ignore_unused_parameters=self.zero_ignore_unused_parameters(), - partition_grads=zero_stage == ZERO_OPTIMIZATION_GRADIENTS, + partition_grads=zero_stage == ZeroStageEnum.gradients, round_robin_gradients=round_robin_gradients, has_moe_layers=self.has_moe_layers, fp16_master_weights_and_gradients=self.fp16_master_weights_and_gradients( @@ -1357,38 +1562,54 @@ class DeepSpeedEngine(Module): communication_data_type=self.communication_data_type, elastic_checkpoint=self.zero_elastic_checkpoint()) - elif zero_stage == ZERO_OPTIMIZATION_WEIGHTS: + elif zero_stage == ZeroStageEnum.weights: assert not self.has_moe_layers, "MoE not supported with Stage 3" - logger.info("Initializing ZeRO Stage 3") if dist.get_rank() == 0 else None - from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 - - optimizer = DeepSpeedZeroOptimizer_Stage3( - self.module, - optimizer, - timers=timers, - ds_config=self.config, - static_loss_scale=self.loss_scale(), - dynamic_loss_scale=self.dynamic_loss_scale(), - dynamic_loss_args=self.dynamic_loss_scale_args(), - clip_grad=self.gradient_clipping(), - contiguous_gradients=self.zero_contiguous_gradients(), - reduce_bucket_size=self.zero_reduce_bucket_size(), - prefetch_bucket_size=self.zero_prefetch_bucket_size(), - max_reuse_distance=self.zero_max_reuse_distance(), - max_live_parameters=self.zero_max_live_parameters(), - param_persistence_threshold=self.zero_param_persistence_threshold(), - dp_process_group=self.data_parallel_group, - reduce_scatter=self.zero_reduce_scatter(), - overlap_comm=self.zero_overlap_comm(), - offload_optimizer_config=self.zero_offload_optimizer(), 
- offload_param_config=self.zero_offload_param(), - sub_group_size=self.zero_sub_group_size(), - mpu=self.mpu, - postscale_gradients=self.postscale_gradients(), - gradient_predivide_factor=self.gradient_predivide_factor(), - gradient_accumulation_steps=self.gradient_accumulation_steps(), - aio_config=self.aio_config(), - communication_data_type=self.communication_data_type) + if isinstance(optimizer, DummyOptim): + log_dist("Creating ZeRO Offload", ranks=[0]) + optimizer = DeepSpeedZeRoOffload( + self.module, + timers=timers, + ds_config=self.config, + overlap_comm=self.zero_overlap_comm(), + prefetch_bucket_size=self.zero_prefetch_bucket_size(), + max_reuse_distance=self.zero_max_reuse_distance(), + max_live_parameters=self.zero_max_live_parameters(), + param_persistence_threshold=self.zero_param_persistence_threshold(), + model_persistence_threshold=self.zero_model_persistence_threshold(), + offload_param_config=self.zero_offload_param(), + mpu=self.mpu) + else: + log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', + ranks=[0]) + from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 + optimizer = DeepSpeedZeroOptimizer_Stage3( + self.module, + optimizer, + timers=timers, + ds_config=self.config, + static_loss_scale=self.loss_scale(), + dynamic_loss_scale=self.dynamic_loss_scale(), + dynamic_loss_args=self.dynamic_loss_scale_args(), + clip_grad=self.gradient_clipping(), + contiguous_gradients=self.zero_contiguous_gradients(), + reduce_bucket_size=self.zero_reduce_bucket_size(), + prefetch_bucket_size=self.zero_prefetch_bucket_size(), + max_reuse_distance=self.zero_max_reuse_distance(), + max_live_parameters=self.zero_max_live_parameters(), + param_persistence_threshold=self.zero_param_persistence_threshold(), + model_persistence_threshold=self.zero_model_persistence_threshold(), + dp_process_group=self.data_parallel_group, + reduce_scatter=self.zero_reduce_scatter(), + overlap_comm=self.zero_overlap_comm(), + 
offload_optimizer_config=self.zero_offload_optimizer(), + offload_param_config=self.zero_offload_param(), + sub_group_size=self.zero_sub_group_size(), + mpu=self.mpu, + postscale_gradients=self.postscale_gradients(), + gradient_predivide_factor=self.gradient_predivide_factor(), + gradient_accumulation_steps=self.gradient_accumulation_steps(), + aio_config=self.aio_config(), + communication_data_type=self.communication_data_type) else: raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) @@ -1413,8 +1634,8 @@ class DeepSpeedEngine(Module): return pld - def _configure_curriculum_scheduler(self): - scheduler = CurriculumScheduler(self.curriculum_params()) + def _configure_curriculum_scheduler_legacy(self): + scheduler = CurriculumScheduler(self.curriculum_params_legacy()) return scheduler @staticmethod @@ -1432,11 +1653,9 @@ class DeepSpeedEngine(Module): def was_step_applied(self) -> bool: """Returns True if the latest ``step()`` produced in parameter updates. - Note that a ``False`` return is not an error condition. Steps are frequently no-ops, such as between gradient accumulation boundaries or when overflows occur. - Returns: bool: Whether the latest ``step()`` modified model parameters. """ @@ -1454,9 +1673,6 @@ class DeepSpeedEngine(Module): or self.is_iterable_style_dataset(dataset)): raise ValueError("Training data must be a torch Dataset") - if data_sampler is None and (route == ROUTE_PREDICT or route == ROUTE_EVAL): - data_sampler = torch.utils.data.SequentialSampler(dataset) - if batch_size is None: batch_size = self.train_micro_batch_size_per_gpu() @@ -1469,23 +1685,50 @@ class DeepSpeedEngine(Module): deepspeed_io_timer = self.tput_timer # If mpu is provided, forward world size and parallel rank to sampler. 
- data_parallel_world_size = None - data_parallel_rank = None + data_parallel_world_size = self.dp_world_size + data_parallel_rank = self.global_rank if self.mpu is not None: data_parallel_world_size = self.mpu.get_data_parallel_world_size() data_parallel_rank = self.mpu.get_data_parallel_rank() - return DeepSpeedDataLoader(dataset=dataset, - batch_size=batch_size, - pin_memory=pin_memory, - collate_fn=collate_fn, - local_rank=self.local_rank, - tput_timer=deepspeed_io_timer, - num_local_io_workers=num_local_io_workers, - data_sampler=data_sampler, - data_parallel_world_size=data_parallel_world_size, - data_parallel_rank=data_parallel_rank, - dataloader_drop_last=self.dataloader_drop_last()) + if data_sampler is None and (route == ROUTE_PREDICT or route == ROUTE_EVAL): + data_sampler = torch.utils.data.DistributedSampler( + dataset, + num_replicas=data_parallel_world_size, + rank=data_parallel_rank, + shuffle=False, + ) + + deepspeed_dataloader_config = {} + if self.curriculum_learning_enabled(): + deepspeed_dataloader_config = { + CURRICULUM_LEARNING: + self.curriculum_learning_enabled(), + DATA_EFFICIENCY: + self.data_efficiency_config(), + DATA_PARALLEL_GROUP: + self.data_parallel_group, + GRADIENT_ACCUMULATION_STEPS: + self.gradient_accumulation_steps(), + GLOBAL_RANK: + self.global_rank, + DATA_SAMPLING_NUM_WORKERS: + self.data_sampling_config()[DATA_SAMPLING_NUM_WORKERS] + } + + return DeepSpeedDataLoader( + dataset=dataset, + batch_size=batch_size, + pin_memory=pin_memory, + collate_fn=collate_fn, + local_rank=self.local_rank, + tput_timer=deepspeed_io_timer, + num_local_io_workers=num_local_io_workers, + data_sampler=data_sampler, + data_parallel_world_size=data_parallel_world_size, + data_parallel_rank=data_parallel_rank, + dataloader_drop_last=self.dataloader_drop_last(), + deepspeed_dataloader_config=deepspeed_dataloader_config) def train(self, mode=True): r"""""" @@ -1536,23 +1779,41 @@ class DeepSpeedEngine(Module): == 
self.flops_profiler_profile_step() and self.global_rank == 0) + # used to check quantization happens at step 0! + if self.global_steps == 0 and hasattr(self, "compression_scheduler"): + self.compression_scheduler.step(step_zero_check=True) + if self.quantizer: + tensor_to_quantize = self.optimizer.bit16_groups if self.zero_optimization_stage( + ) == 2 else self.optimizer.fp16_groups + if self.compression_scheduler.weight_quantization_enabled: + self.quantizer.quantize( + tensor_to_quantize, + (self.optimizer.overflow if self.fp16_enabled() else False), + self.eigenvalue_enabled(), + None, + ) + if flops_profiler_active: self.flops_profiler.start_profile(ignore_list=None) - if self.module.training and self.progressive_layer_drop: - kwargs.update(self.progressive_layer_drop.get_state()) + if self.module.training: + if self.progressive_layer_drop: + kwargs.update(self.progressive_layer_drop.get_state()) if self.__class__.__name__ != "PipelineEngine": # TODO: The above if condition is a HACK since for PipelineEngine # it's difficult to inject argument in forward pass. - if self.module.training and self.curriculum_enabled(): - self.curriculum_scheduler.update_difficulty(self.global_steps + 1) - if self.curriculum_params()["curriculum_type"] == "seqlen": + if self.module.training and self.curriculum_enabled_legacy(): + self.curriculum_scheduler_legacy.update_difficulty(self.global_steps + 1) + if self.curriculum_params_legacy()["curriculum_type"] == "seqlen": kwargs.update({ "curriculum_seqlen": - self.curriculum_scheduler.get_current_difficulty() + self.curriculum_scheduler_legacy.get_current_difficulty() }) + if self.module.training and self.random_ltd_enabled(): + self.random_ltd_scheduler.update_seq(self.global_steps) + if self.zero_optimization_partition_weights(): # Enable automated discovery of external parameters by indicating that # we are in a forward pass. 
@@ -1565,6 +1826,9 @@ class DeepSpeedEngine(Module): if self.training_dataloader is None: self.tput_timer.start() + if self.fp16_auto_cast(): + inputs = self._cast_inputs_half(inputs) + loss = self.module(*inputs, **kwargs) if self.zero_optimization_partition_weights(): @@ -1588,6 +1852,22 @@ class DeepSpeedEngine(Module): see_memory_usage("Engine after forward", force=self.memory_breakdown()) return loss + def _cast_inputs_half(self, inputs): + if isinstance(inputs, (list, tuple)): + new_inputs = [] + for v in inputs: + new_inputs.append(self._cast_inputs_half(v)) + return inputs.__class__(new_inputs) + elif isinstance(inputs, dict): + new_inputs = {} + for k, v in inputs.items(): + new_inputs[k] = self._cast_inputs_half(v) + return new_inputs + elif hasattr(inputs, 'half'): + return inputs.half() + else: + return inputs + def print_forward_breakdown(self, fwd_time): gate_time = 0.0 moe_time = 0.0 @@ -1606,9 +1886,9 @@ class DeepSpeedEngine(Module): # TODO: Allreduce/average them across ranks for more accurate timing. 
- # if torch.distributed.get_rank() == 0: + # if deepspeed.comm.get_rank() == 0: log_dist( - f"rank={torch.distributed.get_rank()} time (ms) | forward: {fwd_time:.2f} (forward_moe: {moe_time:.2f}, 1st alltoall: {falltoall:.2f}, 2nd alltoall: {salltoall:.2f}, top-k: {gate_time:.2f})", + f"rank={dist.get_rank()} time (ms) | forward: {fwd_time:.2f} (forward_moe: {moe_time:.2f}, 1st alltoall: {falltoall:.2f}, 2nd alltoall: {salltoall:.2f}, top-k: {gate_time:.2f})", ranks=[0]) @instrument_w_nvtx @@ -1619,41 +1899,49 @@ class DeepSpeedEngine(Module): # Pass (PP) gas boundary flag to optimizer (required for zero) self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( ) - - # ZeRO stage 2 communicates during non gradient accumulation boundaries as well + # ZeRO stage >= 2 communicates during non gradient accumulation boundaries as well if self.zero_optimization_partition_gradients(): self.optimizer.overlapping_partition_gradients_reduce_epilogue() # Communicate only at gradient accumulation boundaries elif self.is_gradient_accumulation_boundary(): - if self.zero_optimization_stage() == ZERO_OPTIMIZATION_OPTIMIZER_STATES: + if self.zero_optimization_stage() == ZeroStageEnum.optimizer_states: self.optimizer.reduce_gradients( pipeline_parallel=self.pipeline_parallelism) else: self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) @instrument_w_nvtx - def backward(self, loss, allreduce_gradients=True, release_loss=False): + def backward(self, + loss, + allreduce_gradients=True, + release_loss=False, + retain_graph=False, + scale_wrt_gas=True): r"""Execute backward pass on the loss - Arguments: loss: Torch tensor on which to execute backward propagation allreduce_gradients: is deprecated, ignored, and will soon be removed' + retain_graph: bool, default: false + forward on user defined choice of retain_graph """ see_memory_usage("Engine before backward", force=self.memory_breakdown()) + if self.scale_wrt_gas is not None: + 
scale_wrt_gas = self.scale_wrt_gas + if not allreduce_gradients: logger.warning( f"Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed" ) # scale loss w.r.t. gradient accumulation if needed - if self.gradient_accumulation_steps() > 1: + if self.gradient_accumulation_steps() > 1 and scale_wrt_gas: loss = self._scale_loss_by_gas(loss.float()) # Log training Loss - if self.tensorboard_enabled(): + if self.monitor.enabled: if self.is_gradient_accumulation_boundary(): if self.global_rank == 0: self.summary_events = [( @@ -1661,9 +1949,7 @@ class DeepSpeedEngine(Module): loss.mean().item() * self.gradient_accumulation_steps(), self.global_samples, )] - for event in self.summary_events: # write_summary_events - self.summary_writer.add_scalar(event[0], event[1], event[2]) - self.summary_writer.flush() + self.monitor.write_events(self.summary_events) self._start_timers(self.engine_timers.backward_timers) @@ -1673,9 +1959,9 @@ class DeepSpeedEngine(Module): self._start_timers(self.engine_timers.backward_inner_timers) if self.zero_optimization(): - self.optimizer.is_gradient_accumulation_boundary = ( - self.is_gradient_accumulation_boundary()) - self.optimizer.backward(loss) + self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( + ) + self.optimizer.backward(loss, retain_graph=retain_graph) elif self.amp_enabled(): # AMP requires delaying unscale when inside gradient accumulation boundaries # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations @@ -1683,19 +1969,19 @@ class DeepSpeedEngine(Module): with amp.scale_loss(loss, self.optimizer, delay_unscale=delay_unscale) as scaled_loss: - scaled_loss.backward() + scaled_loss.backward(retain_graph=retain_graph) elif self.fp16_enabled(): if self.eigenvalue_enabled(): self.optimizer.backward(loss, create_graph=True, retain_graph=True) else: - self.optimizer.backward(loss) + self.optimizer.backward(loss, retain_graph=retain_graph) elif 
self.bfloat16_enabled(): self.optimizer.backward(loss) else: if self.eigenvalue_enabled(): loss.backward(create_graph=True, retain_graph=True) else: - loss.backward() + loss.backward(retain_graph=retain_graph) self._stop_timers(self.engine_timers.backward_inner_timers) @@ -1718,11 +2004,14 @@ class DeepSpeedEngine(Module): return loss def is_gradient_accumulation_boundary(self): - """Query whether the current micro-batch is at the boundary of + """ + Query whether the current micro-batch is at the boundary of gradient accumulation, and thus will trigger gradient reductions and an optimizer step. + Returns: bool: if the current step is a gradient accumulation boundary. + """ if self._is_gradient_accumulation_boundary is None: return (self.micro_steps + 1) % \ @@ -1731,14 +2020,13 @@ class DeepSpeedEngine(Module): return self._is_gradient_accumulation_boundary def set_gradient_accumulation_boundary(self, is_boundary): - """Manually overrides the DeepSpeed engine's gradient accumulation boundary state, this is an optional + """ + Manually overrides the DeepSpeed engine's gradient accumulation boundary state, this is an optional feature and should be used with care. The state should be set before to the intended value before each forward/backward. The final fordward/backward should have the boundary state set to True. This style allows client code to only call engine.step() once after all the gradient accumulation passes are complete. See example below: - .. code-block:: python - engine.set_gradient_accumulation_boundary(False) for _ in range(gradient_accumulation_steps - 1): micro_batch = next(data_loader) @@ -1749,7 +2037,6 @@ class DeepSpeedEngine(Module): loss = engine(micro_batch) engine.backward(loss) engine.step() - Arguments: is_boundary (bool): are we at a gradient accumulation boundary or not? 
""" @@ -1787,17 +2074,15 @@ class DeepSpeedEngine(Module): # Quantize the updated parameter if there is no overflow if self.quantizer: - if self.fp16_enabled(): - tensor_to_quantize = self.optimizer.bit16_groups if self.zero_optimization_stage( - ) == 2 else self.optimizer.fp16_groups - else: - tensor_to_quantize = self.optimizer.param_groups - self.quantizer.quantize( - tensor_to_quantize, - (self.optimizer.overflow if self.fp16_enabled() else False), - self.eigenvalue_enabled(), - block_eigenvalue, - ) + tensor_to_quantize = self.optimizer.bit16_groups if self.zero_optimization_stage( + ) == 2 else self.optimizer.fp16_groups + if self.compression_scheduler.weight_quantization_enabled: + self.quantizer.quantize( + tensor_to_quantize, + (self.optimizer.overflow if self.fp16_enabled() else False), + self.eigenvalue_enabled(), + block_eigenvalue, + ) # zero grad in basic optimizer could be unreliable and may not exhibit # the behaviour that we want if self.bfloat16_enabled(): @@ -1822,6 +2107,7 @@ class DeepSpeedEngine(Module): if overflow: self.skipped_steps += 1 else: + self.compression_scheduler.step() if self.lr_scheduler is not None: try: self.lr_scheduler.step(**(lr_kwargs or {})) @@ -1854,7 +2140,7 @@ class DeepSpeedEngine(Module): assert self.optimizer is not None and not isinstance(self.optimizer, DummyOptim), \ "must provide optimizer during init in order to use step" - report_progress = self.global_rank == 0 if self.global_rank else True + report_progress = False self._step_applied = False # assume False, will flip to True @@ -1881,19 +2167,21 @@ class DeepSpeedEngine(Module): else: self._take_model_step(lr_kwargs) - self.tput_timer.stop(report_progress) + report_progress = self.global_rank == 0 if self.global_rank else True + + self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), + report_speed=report_progress) self._stop_timers(self.engine_timers.step_timers) # Log learning rate - if self.tensorboard_enabled(): + if 
self.monitor.enabled: if self.is_gradient_accumulation_boundary(): if self.global_rank == 0: self.summary_events = [(f"Train/Samples/lr", self.get_lr()[0], self.global_samples)] - for event in self.summary_events: # write_summary_events - self.summary_writer.add_scalar(event[0], event[1], event[2]) + if self.fp16_enabled() and hasattr(self.optimizer, "cur_scale"): self.summary_events.append(( f"Train/Samples/loss_scale", @@ -1905,16 +2193,12 @@ class DeepSpeedEngine(Module): self.eigenvalue_gas_boundary_resolution()): ev_values = self.block_eigenvalue.values() for i in range(len(ev_values)): - self.summary_writer.add_scalar( + self.summary_events.append(( f"Train/Eigenvalues/ModelBlockParam_{i}", self.ev_values[i][0], self.global_samples, - ) - self.summary_writer.flush() - - for event in self.summary_events: # write_summary_events - self.summary_writer.add_scalar(event[0], event[1], event[2]) - self.summary_writer.flush() + )) + self.monitor.write_events(self.summary_events) # Check flops profiling if flops_profiler_active: @@ -1942,12 +2226,11 @@ class DeepSpeedEngine(Module): if self.wall_clock_breakdown() or self.flops_profiler_enabled(): # Log global timing and reset if self.is_gradient_accumulation_boundary(): - if self.tensorboard_enabled(): - self._write_tensorboard() + if self.monitor.enabled: + self._write_monitor() if self.has_moe_layers: - fwd_time = self.timers(FORWARD_GLOBAL_TIMER).elapsed( - reset=False) * 1000 + fwd_time = self.timers(FORWARD_GLOBAL_TIMER).elapsed(reset=False) self.print_forward_breakdown(fwd_time=fwd_time) self.timers.log(self.engine_timers.global_timers) @@ -1977,49 +2260,48 @@ class DeepSpeedEngine(Module): titer = msg[FORWARD_GLOBAL_TIMER] + msg[BACKWARD_GLOBAL_TIMER] + msg[ STEP_GLOBAL_TIMER] msg["latency"] = titer - msg["FLOPS_per_gpu"] = self.flops * self.gradient_accumulation_steps( + msg["FLOPS_per_gpu"] = self.flops * 1_000_000 * self.gradient_accumulation_steps( ) / titer - msg["throughput"] = self.train_batch_size() * 
1000 / \ + msg["throughput"] = self.train_batch_size() * 1_000_000 / \ msg["latency"] print_json_dist(msg, [0], path=self.autotuning_metric_path()) + log_dist( + f"Wrote metrics to {self.autotuning_metric_path()}, {os.path.abspath(self.autotuning_metric_path())}", + ranks=[0]) import atexit atexit.register(print, "Autotuning: done with running current ds config.") exit() - def _write_tensorboard(self): + def _write_monitor(self): if self.global_rank == 0: self.summary_events = [ ( f"Train/Samples/elapsed_time_ms_forward", - self.timers(FORWARD_GLOBAL_TIMER).elapsed(reset=False) * 1000.0, + self.timers(FORWARD_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), ( f"Train/Samples/elapsed_time_ms_backward", - self.timers(BACKWARD_GLOBAL_TIMER).elapsed(reset=False) * 1000.0, + self.timers(BACKWARD_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), ( f"Train/Samples/elapsed_time_ms_backward_inner", - self.timers(BACKWARD_INNER_GLOBAL_TIMER).elapsed(reset=False) * - 1000.0, + self.timers(BACKWARD_INNER_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), ( f"Train/Samples/elapsed_time_ms_backward_allreduce", - self.timers(BACKWARD_REDUCE_GLOBAL_TIMER).elapsed(reset=False) * - 1000.0, + self.timers(BACKWARD_REDUCE_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), ( f"Train/Samples/elapsed_time_ms_step", - self.timers(STEP_GLOBAL_TIMER).elapsed(reset=False) * 1000.0, + self.timers(STEP_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), ] - for event in self.summary_events: # write_summary_events - self.summary_writer.add_scalar(event[0], event[1], event[2]) - self.summary_writer.flush() + self.monitor.write_events(self.summary_events) def _get_optimizer_param(self, param_name): result = [] @@ -2121,7 +2403,8 @@ class DeepSpeedEngine(Module): grad_data = param.grad.data if param_name in self.sparse_tensor_module_names or grad_data.is_sparse: - grad_data = SparseTensor(grad_data) + # Call param.grad without data to avoid problem 
with setting of updated grads + grad_data = SparseTensor(param.grad) if is_moe_param(param): expert_grads[param.group_name].append(grad_data) @@ -2191,9 +2474,6 @@ class DeepSpeedEngine(Module): return sparse_list def sparse_allreduce(self, sparse, dp_group): - # Pre-divide for fp16 stability - sparse.values.mul_(1.0 / dist.get_world_size(group=dp_group)) - original_data_type = sparse.values.dtype if self.communication_data_type != sparse.values.dtype: if self.communication_data_type in (torch.float16, torch.bfloat16): @@ -2205,6 +2485,13 @@ class DeepSpeedEngine(Module): indices = sparse.indices values = sparse.values + if self.postscale_gradients(): + if self.gradient_average: + values.mul_(self.gradient_predivide_factor() / + dist.get_world_size(group=dp_group)) + else: + values.mul_(1. / dist.get_world_size(group=dp_group)) + indices_device_list = self.sparse_all_gather(indices, dp_group) values_device_list = self.sparse_all_gather(values, dp_group) @@ -2257,6 +2544,8 @@ class DeepSpeedEngine(Module): def module_state_dict(self, destination=None, prefix="", keep_vars=False): sd = self.module.state_dict(destination, prefix, keep_vars) + if self.random_ltd_enabled(): + sd = remove_random_ltd_state_dict(sd) return sd @staticmethod @@ -2266,7 +2555,8 @@ class DeepSpeedEngine(Module): old_moe_load, model=None, mpu=None, - num_experts=1): + num_experts=1, + checkpoint_engine=TorchCheckpointEngine()): if old_moe_load: expp_rank = groups._get_expert_data_parallel_rank( groups._get_max_expert_size_name()) @@ -2276,7 +2566,7 @@ class DeepSpeedEngine(Module): groups._get_max_expert_size_name()) for local_expert_id in range(num_local_experts): global_expert_id = expp_rank * num_local_experts + local_expert_id - expert_state_dict = torch.load(DeepSpeedEngine._get_expert_ckpt_name( + expert_state_dict = checkpoint_engine.load(DeepSpeedEngine._get_expert_ckpt_name( checkpoint_path, -1, # -1 means ignore layer_id global_expert_id, @@ -2295,14 +2585,14 @@ class 
DeepSpeedEngine(Module): else: moe_layer_id = 0 for n_module, module in model.named_modules(): - if isinstance(module, MoE): # and torch.distributed.get_rank() == 0: + if isinstance(module, MoE): # and deepspeed.comm.get_rank() == 0: group_name = module.expert_group_name num_local_experts = module.num_local_experts expp_rank = groups._get_expert_parallel_rank(group_name) # loop all local_experts for local_expert_id in range(num_local_experts): global_expert_id = expp_rank * num_local_experts + local_expert_id - expert_state_dict = torch.load( + expert_state_dict = checkpoint_engine.load( DeepSpeedEngine._get_expert_ckpt_name( checkpoint_path, moe_layer_id, @@ -2321,8 +2611,12 @@ class DeepSpeedEngine(Module): state_dict.update(expert_state_dict) moe_layer_id += 1 - def load_module_state_dict(self, state_dict, strict=True): - self.module.load_state_dict(state_dict, strict=strict) + def load_module_state_dict(self, state_dict, strict=True, custom_load_fn=None): + if custom_load_fn: + custom_load_fn(src=state_dict, dst=self.module) + else: + self.module.load_state_dict(state_dict, # TODO + strict=strict) def _get_zero_ckpt_prefix(self, dp_rank, bf16_mode): return f'{"bf16_" if bf16_mode else ""}zero_pp_rank_{dp_rank}' @@ -2343,7 +2637,7 @@ class DeepSpeedEngine(Module): def _get_zero_ckpt_name(self, checkpoints_path, tag): mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - pp_rank = torch.distributed.get_rank(group=self.optimizer.dp_process_group) + pp_rank = dist.get_rank(group=self.optimizer.dp_process_group) bf16_mode = self.bfloat16_enabled() return self._get_rank_zero_ckpt_name(checkpoints_path, tag, @@ -2360,7 +2654,7 @@ class DeepSpeedEngine(Module): if self.zero_optimization_partition_weights(): filename = "zero_pp_rank_{}".format( - torch.distributed.get_rank(group=self.optimizer.dp_process_group)) + dist.get_rank(group=self.optimizer.dp_process_group)) ckpt_name = os.path.join( checkpoints_path, str(tag), @@ -2417,8 +2711,10 @@ class 
DeepSpeedEngine(Module): load_module_strict=True, load_optimizer_states=True, load_lr_scheduler_states=True, - load_module_only=False): - """Load training checkpoint + load_module_only=False, + custom_load_fn=None): + """ + Load training checkpoint Arguments: load_dir: Required. Directory to load the checkpoint from @@ -2427,30 +2723,38 @@ class DeepSpeedEngine(Module): load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint. Ex. ADAM's momentum and variance load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint. load_module_only: Optional. Boolean to load only the model weights from the checkpoint. Ex. warmstarting. + custom_load_fn: Optional. Custom model load function. + Returns: A tuple of ``load_path`` and ``client_state``. - *``load_path``: Path of the loaded checkpoint. ``None`` if loading the checkpoint failed. - *``client_state``: State dictionary used for loading required training states in the client code. Important: under ZeRO3, one cannot load checkpoint with ``engine.load_checkpoint()`` right after ``engine.save_checkpoint()``. It is because ``engine.module`` is partitioned, and ``load_checkpoint()`` wants a pristine model. If insisting to do so, please reinitialize engine before ``load_checkpoint()``. + """ if tag is None: - latest_path = os.path.join(load_dir, "latest") + latest_tag = "latest_universal" if self.load_universal_checkpoint( + ) else "latest" + latest_path = os.path.join(load_dir, latest_tag) if os.path.isfile(latest_path): with open(latest_path, "r") as fd: tag = fd.read().strip() else: - logger.warning( - f"Unable to find latest file at {latest_path}, if trying to load latest " - "checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint." 
- ) - return None, None + if self.load_universal_checkpoint(): + raise ValueError( + f'Invalid for universal checkpoint: {latest_path} does not exist' + ) + else: + logger.warning( + f"Unable to find latest file at {latest_path}, if trying to load latest " + "checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint." + ) + return None, None if self.zero_optimization_partition_weights(): # Prepare for checkpoint load by ensuring all parameters are partitioned @@ -2461,7 +2765,8 @@ class DeepSpeedEngine(Module): load_module_strict=load_module_strict, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_lr_scheduler_states, - load_module_only=load_module_only) + load_module_only=load_module_only, + custom_load_fn=custom_load_fn) load_zero_checkpoint = self.zero_optimization() or self.bfloat16_enabled() if load_zero_checkpoint and load_path is not None: @@ -2483,12 +2788,15 @@ class DeepSpeedEngine(Module): load_module_strict=True, load_optimizer_states=True, load_lr_scheduler_states=True, - load_module_only=False): + load_module_only=False, + custom_load_fn=None): from deepspeed.runtime.state_dict_factory import SDLoaderFactory ckpt_list = self._get_all_ckpt_names(load_dir, tag) - sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list) + sd_loader = SDLoaderFactory.get_sd_loader( + ckpt_list, + checkpoint_engine=self.checkpoint_engine) is_pipe_parallel = isinstance(self.module, PipelineModule) @@ -2515,10 +2823,12 @@ class DeepSpeedEngine(Module): old_moe_load=old_moe_load, model=self.module, mpu=self.mpu, - num_experts=self.num_experts) - - self.load_module_state_dict(state_dict=checkpoint['module'], - strict=load_module_strict) + num_experts=self.num_experts, + checkpoint_engine=self.checkpoint_engine) + if not self.load_universal_checkpoint(): + self.load_module_state_dict(state_dict=checkpoint['module'], + strict=load_module_strict, + custom_load_fn=custom_load_fn) 
self.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size'] @@ -2531,8 +2841,9 @@ class DeepSpeedEngine(Module): largest_group_name = groups._get_max_expert_size_name() expp_rank = groups._get_expert_parallel_rank(largest_group_name) optim_load_path = self._get_optimizer_ckpt_name(load_dir, tag, expp_rank) - optim_checkpoint = torch.load(optim_load_path, - map_location=torch.device('cpu')) + optim_checkpoint = self.checkpoint_engine.load( + optim_load_path, + map_location=torch.device('cpu')) else: optim_checkpoint = checkpoint @@ -2549,6 +2860,15 @@ class DeepSpeedEngine(Module): if load_lr_scheduler_states and self.lr_scheduler is not None: self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + if self.random_ltd_enabled( + ) and self.random_ltd_scheduler is not None and 'random_ltd' in checkpoint: + self.random_ltd_scheduler.load_state_dict(checkpoint['random_ltd']) + + if self.training_dataloader is not None and self.curriculum_learning_enabled( + ) and 'data_sampler' in checkpoint: + self.training_dataloader.data_sampler.load_state_dict( + checkpoint['data_sampler']) + def get_sparse_tensor_module_names(original_set, loaded_set, original_parameters, @@ -2595,7 +2915,9 @@ class DeepSpeedEngine(Module): 'skipped_steps', 'global_steps', 'dp_world_size', - 'mp_world_size' + 'mp_world_size', + 'data_sampler', + 'random_ltd' ] client_state = {} @@ -2616,25 +2938,35 @@ class DeepSpeedEngine(Module): return load_path, client_state def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True): - zero_sd_list = self._get_all_zero_checkpoints(load_dir, tag) - if zero_sd_list is None: - return False - - if load_optimizer_states and self.dp_world_size != self.loaded_checkpoint_dp_world_size: - raise ZeRORuntimeException("The checkpoint being loaded used a DP " \ - f"world size of {self.loaded_checkpoint_dp_world_size} but the " \ - f"current world size is {self.dp_world_size}. 
Automatic adjustment " \ - "of ZeRO's optimizer state partitioning with a new world size is not " \ - "currently supported.") + if self.load_universal_checkpoint(): + zero_sd_list = None + checkpoint_folder = f'{os.path.join(load_dir, tag)}' + else: + if load_optimizer_states and self.dp_world_size != self.loaded_checkpoint_dp_world_size: + raise ZeRORuntimeException("The checkpoint being loaded used a DP " \ + f"world size of {self.loaded_checkpoint_dp_world_size} but the " \ + f"current world size is {self.dp_world_size}. Automatic adjustment " \ + "of ZeRO's optimizer state partitioning with a new world size is not " \ + "currently supported.") + checkpoint_folder = None + zero_sd_list = self._get_all_zero_checkpoints(load_dir, tag) + if zero_sd_list is None: + return False self.optimizer.load_state_dict( state_dict_list=zero_sd_list, load_optimizer_states=load_optimizer_states, load_from_fp32_weights=self.zero_load_from_fp32_weights(), - ) - logger.info( - f"loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}" - ) + checkpoint_folder=checkpoint_folder) + + if self.load_universal_checkpoint(): + logger.info( + f'loaded universal zero checkpoints from {checkpoint_folder} for rank {self.global_rank}' + ) + else: + logger.info( + f"loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}" + ) return True def _get_mp_rank_zero_checkpoint_names(self, @@ -2662,7 +2994,6 @@ class DeepSpeedEngine(Module): mp_rank=mp_rank, dp_world_size=self.loaded_checkpoint_dp_world_size, bf16_mode=bf16_mode) - invalid_zero_ckpt_paths = [] for i, ckpt_name in enumerate(zero_ckpt_names): if not os.path.exists(ckpt_name): # transparently handle the old file pattern for optim_states @@ -2672,13 +3003,6 @@ class DeepSpeedEngine(Module): if os.path.exists(ckpt_name_try): zero_ckpt_names[i] = ckpt_name_try continue - invalid_zero_ckpt_paths.append(ckpt_name) - - if len(invalid_zero_ckpt_paths) > 0: - logger.warn( - f"The following zero 
checkpoints paths are missing: {invalid_zero_ckpt_paths}" - ) - return None return zero_ckpt_names @@ -2686,10 +3010,15 @@ class DeepSpeedEngine(Module): zero_sd_list = [] for i, ckpt_name in enumerate(zero_ckpt_names): _state = None + if ckpt_name is None: + _state = {OPTIMIZER_STATE_DICT: None} # Fully load state for current rank - if self.zero_elastic_checkpoint() or dist.get_rank( + elif self.zero_elastic_checkpoint() or dist.get_rank( group=self.optimizer.dp_process_group) == i: - _state = torch.load(ckpt_name, map_location='cpu') + _state = self.checkpoint_engine.load( + ckpt_name, + map_location='cpu', + ) else: _state = {OPTIMIZER_STATE_DICT: None} zero_sd_list.append(_state) @@ -2724,8 +3053,8 @@ class DeepSpeedEngine(Module): bhash = torch.ByteTensor([s_hash.digest()]).flatten().to(self.device) max_bhash = bhash.clone() min_bhash = bhash.clone() - dist.all_reduce(max_bhash, op=torch.distributed.ReduceOp.MAX) - dist.all_reduce(min_bhash, op=torch.distributed.ReduceOp.MIN) + dist.all_reduce(max_bhash, op=dist.ReduceOp.MAX) + dist.all_reduce(min_bhash, op=dist.ReduceOp.MIN) valid = all(min_bhash == bhash) and all(max_bhash == bhash) msg = ( f"[rank={dist.get_rank()}] The checkpoint tag name '{tag}' is not consistent across " @@ -2737,7 +3066,7 @@ class DeepSpeedEngine(Module): logger.warning(msg) def save_checkpoint(self, save_dir, tag=None, client_state={}, save_latest=True): - r"""Save training checkpoint + """Save training checkpoint Arguments: save_dir: Required. Directory for saving the checkpoint @@ -2745,28 +3074,31 @@ class DeepSpeedEngine(Module): used if not provided. Tag name must be the same across all ranks. client_state: Optional. State dictionary used for saving required training states in the client code. save_latest: Optional. Save a file 'latest' pointing to the latest saved checkpoint. - Important: all processes must call this method and not just the process with rank 0. 
It is because each process needs to save its master weights and scheduler+optimizer states. This method will hang waiting to synchronize with other processes if it's called just for the process with rank 0. + """ if self.zero_optimization_partition_weights(): # Prepare for checkpoint save by ensuring all parameters are partitioned self.optimizer.checkpoint_event_prologue() + rank = self.local_rank if self.use_node_local_storage() else self.global_rank + # This is to make sure the checkpoint names are created without collision # There seems to be issue creating them in parallel # Ensure save_dir directory exists os.makedirs(save_dir, exist_ok=True) - torch.distributed.barrier() + dist.barrier() if tag is None: tag = f"global_step{self.global_steps}" # Ensure tag is a string tag = str(tag) + self.checkpoint_engine.create(tag) # Ensure checkpoint tag is consistent across ranks self._checkpoint_tag_validation(tag) @@ -2776,7 +3108,11 @@ class DeepSpeedEngine(Module): self._create_checkpoint_file(save_dir, tag, False) self._save_moe_checkpoint(save_dir, tag, client_state=client_state) - if self.save_non_zero_checkpoint: + # We distribute the task of saving layer checkpoint files among + # data parallel instances, so all procs should call _save_checkpoint. + # All procs then call module_state_dict(), but only procs of data + # parallel rank 0 save the general model params. 
+ if not self.has_moe_layers: self._create_checkpoint_file(save_dir, tag, False) self._save_checkpoint(save_dir, tag, client_state=client_state) @@ -2788,11 +3124,13 @@ class DeepSpeedEngine(Module): self.optimizer.checkpoint_event_epilogue() # Save latest checkpoint tag - torch.distributed.barrier() - if save_latest and self.global_rank == 0: + self.checkpoint_engine.commit(tag) + if save_latest and rank == 0: with open(os.path.join(save_dir, 'latest'), 'w') as fd: fd.write(tag) + dist.barrier() + return True def _get_non_moe_state_dict(self, full_state_dict): @@ -2814,7 +3152,7 @@ class DeepSpeedEngine(Module): # Using layer_#_export_# to save the model's expert state_dict moe_layer_id = 0 for n_module, module in self.module.named_modules(): - if isinstance(module, MoE): # and torch.distributed.get_rank() == 0: + if isinstance(module, MoE): # and deepspeed.comm.get_rank() == 0: group_name = module.expert_group_name num_local_experts = module.num_local_experts expp_rank = groups._get_expert_parallel_rank(group_name) @@ -2846,8 +3184,9 @@ class DeepSpeedEngine(Module): num_local_experts + int(local_expert_id) expert_key = key.replace(f'{moe_str_prefix}{local_expert_id}', f'{moe_str_prefix}{global_expert_id}') - experts_state_dict[str( - global_expert_id)][expert_key] = moe_state_dict.pop(key) + # truncating extra tensor (shared) storage + truncated = moe_state_dict.pop(key).clone().detach() + experts_state_dict[str(global_expert_id)][expert_key] = truncated # let save the moe parameters for global_expert_id, expert_state_dict in experts_state_dict.items(): @@ -2858,7 +3197,10 @@ class DeepSpeedEngine(Module): global_expert_id, tag, self.mpu) - torch.save(expert_state_dict, moe_save_path) + if self.random_ltd_enabled(): + expert_state_dict = remove_random_ltd_state_dict( + expert_state_dict) + self.checkpoint_engine.save(expert_state_dict, moe_save_path) moe_layer_id += 1 self._curr_ckpt_path = os.path.join(save_dir, tag) @@ -2879,9 +3221,9 @@ class 
DeepSpeedEngine(Module): self.optimizer.state_dict() if self.optimizer and not self.zero_optimization() else None } - with open(self._get_optimizer_ckpt_name(save_dir, tag, expp_rank), 'wb') as fd: - torch.save(optimizer_state, fd) - fd.flush() + # TODO: why use BufferedWriter not the path + file_path = self._get_optimizer_ckpt_name(save_dir, tag, expp_rank) + self.checkpoint_engine.save(optimizer_state, file_path) # get non-moe parameters model_state_dict = self._get_non_moe_state_dict(self.module_state_dict()) @@ -2894,6 +3236,13 @@ class DeepSpeedEngine(Module): 'lr_scheduler': self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, + 'data_sampler': + self.training_dataloader.data_sampler.state_dict() if + (self.training_dataloader is not None + and self.curriculum_learning_enabled()) else None, + 'random_ltd': + self.random_ltd_scheduler.state_dict() + if self.random_ltd_enabled() else None, 'sparse_tensor_module_names': self.sparse_tensor_module_names, 'skipped_steps': @@ -2911,9 +3260,7 @@ class DeepSpeedEngine(Module): } state.update(client_state) logger.info(f'Saving model checkpoint: {save_path}') - with open(save_path, 'wb') as fd: - torch.save(state, fd) - fd.flush() + self.checkpoint_engine.save(state, save_path) self._curr_save_path = None def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint): @@ -2942,12 +3289,18 @@ class DeepSpeedEngine(Module): def _save_checkpoint(self, save_dir, tag, client_state={}): save_path = self._get_ckpt_name(save_dir, tag) + + zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled() + # A hack to save the checkpointing directory. Pipeline parallelism overrides # module_state_dict() and uses this path to save the model. module_state_dict() - # then instead just returns None. + # then instead just returns None. The module_state_dict() implementation in + # PipelineEngine expects the save path to be set in self._curr_ckpt_path. 
self._curr_ckpt_path = os.path.join(save_dir, tag) - zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled() - state = dict(module=self.module_state_dict(), + module = self.module_state_dict() + self._curr_ckpt_path = None + + state = dict(module=module, buffer_names=self._get_buffer_names(), optimizer=self.optimizer.state_dict() if self.optimizer and not zero_optimizer_state else None, @@ -2955,6 +3308,11 @@ class DeepSpeedEngine(Module): if self.optimizer and zero_optimizer_state else None, lr_scheduler=self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, + data_sampler=self.training_dataloader.data_sampler.state_dict() if + (self.training_dataloader is not None + and self.curriculum_learning_enabled()) else None, + random_ltd=self.random_ltd_scheduler.state_dict() + if self.random_ltd_enabled() else None, sparse_tensor_module_names=self.sparse_tensor_module_names, skipped_steps=self.skipped_steps, global_steps=self.global_steps, @@ -2965,9 +3323,9 @@ class DeepSpeedEngine(Module): ds_version=version) state.update(client_state) - log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0, 1]) - torch.save(state, save_path) - self._curr_save_path = None + if self.save_non_zero_checkpoint: + log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0, 1]) + self.checkpoint_engine.save(state, save_path) def _get_buffer_names(self): buffer_names = [] @@ -2995,11 +3353,9 @@ class DeepSpeedEngine(Module): optimizer. the names are exactly as in state_dict. The order is absolutely important, since the saved data is just flattened data with no identifiers and requires reconstruction in the same order it was saved. - We can't rely on self.module.named_parameters() to get the saved tensors, as some params will be missing and others unsaved and then it'd be impossible to reconstruct state_dict from the flattened weights. - optimizer.bit16_groups seems to be the easiest to use as it's in all zeroX versions. 
""" param_group_shapes = [] @@ -3049,9 +3405,8 @@ class DeepSpeedEngine(Module): zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(), ds_config=self.config, ds_version=version) - with open(zero_checkpoint_name, 'wb') as fd: - torch.save(zero_sd, fd) - fd.flush() + self.checkpoint_engine.save(zero_sd, zero_checkpoint_name) + if self.global_rank == 0: self._copy_recovery_script(save_path) ckpt_type = 'zero' if self.zero_optimization() else 'bf16_zero' @@ -3059,26 +3414,20 @@ class DeepSpeedEngine(Module): def _zero3_consolidated_16bit_state_dict(self): """ - Get a full non-partitioned state_dict with fp16 weights on cpu. - Important: this function must be called on all ranks and not just rank 0. - This is similar to nn.Module.state_dict (modelled after _save_to_state_dict), but: - 1. consolidates the weights from different partitions on gpu0 2. works on one layer at a time to require as little gpu0 memory as possible, by moving the already consolidated weights to cpu 3. takes care to keep the shared params shared when gradually copying the params to cpu - Returns: a consolidated fp16 ``state_dict`` on cpu on rank 0, ``None`` on other ranks - """ if not self.zero_optimization_partition_weights(): raise ValueError("this function requires ZeRO-3 mode") - state_dict = OrderedDict() if torch.distributed.get_rank() == 0 else None + state_dict = OrderedDict() if dist.get_rank() == 0 else None shared_params = {} def get_layer_state_dict(module, prefix=""): @@ -3088,7 +3437,7 @@ class DeepSpeedEngine(Module): with deepspeed.zero.GatheredParameters(list( module.parameters(recurse=False)), modifier_rank=0): - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: # handle params for name, param in module.named_parameters(recurse=False): if param is None: @@ -3134,7 +3483,8 @@ class DeepSpeedEngine(Module): return self.save_16bit_model(save_dir, save_filename) def save_16bit_model(self, save_dir, save_filename="pytorch_model.bin"): - r"""Save 16bit model 
weights + """ + Save 16bit model weights This method saves the 16bit model weights at the desired destination. @@ -3167,9 +3517,9 @@ class DeepSpeedEngine(Module): else: state_dict = self.module.state_dict() - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: os.makedirs(save_dir, exist_ok=True) logger.info(f"Saving model weights to {path}") - torch.save(state_dict, path) + self.checkpoint_engine.save(state_dict, path) return True diff --git a/deepspeed/runtime/fp16/__init__.py b/deepspeed/runtime/fp16/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 100644 --- a/deepspeed/runtime/fp16/__init__.py +++ b/deepspeed/runtime/fp16/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py old mode 100644 new mode 100755 index dc52552aebba6852c5e31dff82d820c028596e30..4f4b5cfa7f2dbf5c088e4f91f5285bf9acd1062e --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -8,14 +8,16 @@ This file is adapted from FP16_Optimizer in NVIDIA/apex import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from deepspeed.runtime import DeepSpeedOptimizer from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import groups, logger, log_dist +from deepspeed import comm as dist from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, CLIP_GRAD -import torch.distributed as dist +from deepspeed.accelerator import get_accelerator -class FP16_Optimizer(object): +class FP16_Optimizer(DeepSpeedOptimizer): """ FP16 Optimizer for training fp16 models. Handles loss scaling. 
@@ -40,8 +42,8 @@ class FP16_Optimizer(object): self.deepspeed = deepspeed self.has_moe_layers = has_moe_layers self.using_pipeline = self.deepspeed.pipeline_parallelism - if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") + if not get_accelerator().is_available(): + raise SystemError("Cannot use fp16 without accelerator.") self.optimizer = init_optimizer # param flattened by groups @@ -93,6 +95,9 @@ class FP16_Optimizer(object): self.cur_scale = static_loss_scale self.verbose = verbose + self.custom_loss_scaler = False + self.external_loss_scale = None + self.clip_grad = clip_grad self.norm_type = 2 self.step_count = 0 @@ -126,14 +131,14 @@ class FP16_Optimizer(object): return - def zero_grad(self, set_grads_to_None=True): + def zero_grad(self, set_to_none=False): """ Zero FP16 parameter grads. """ # For speed, set model fp16 grad to None by default for group in self.fp16_groups: for p in group: - if set_grads_to_None: + if set_to_none: p.grad = None else: if p.grad is not None: @@ -177,7 +182,7 @@ class FP16_Optimizer(object): apply_scale=False) # Stash unscaled gradient norm - self._global_grad_norm = scaled_global_grad_norm / self.cur_scale + self._global_grad_norm = scaled_grad_norm / self.cur_scale # norm is in fact norm*cur_scale self.optimizer.step(grads=[[g] for g in grads_groups_flat], @@ -206,6 +211,23 @@ class FP16_Optimizer(object): if self.timers is not None: self.timers.log(name_list) + def set_lr(self, lr): + """Set the learning rate.""" + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + def get_lr(self): + """Return the current learning rate.""" + return self.optimizer.param_groups[0]["lr"] + + def override_loss_scale(self, loss_scale): + if loss_scale != self.external_loss_scale: + logger.info( + f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' + ) + self.custom_loss_scaler = True + self.external_loss_scale = loss_scale + def step(self, closure=None): 
""" Not supporting closure. @@ -317,7 +339,7 @@ class FP16_Optimizer(object): dtype=torch.float) dist.all_reduce(scaled_norm_tensor, group=pg) all_groups_norm = scaled_norm_tensor.item() - #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {torch.distributed.get_rank()}") + #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {deepspeed.comm.get_rank()}") return all_groups_norm def unscale_and_clip_grads(self, grad_groups_flat, total_norm, apply_scale=True): @@ -343,9 +365,12 @@ class FP16_Optimizer(object): 2. scaled_loss = fp32_loss*loss_scale 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves """ - - scaled_loss = (loss.float()) * self.cur_scale - scaled_loss.backward(create_graph=create_graph, retain_graph=retain_graph) + if self.custom_loss_scaler: + scaled_loss = self.external_loss_scale * loss + scaled_loss.backward() + else: + scaled_loss = (loss.float()) * self.cur_scale + scaled_loss.backward(create_graph=create_graph, retain_graph=retain_graph) def _update_scale(self, skip): if self.dynamic_loss_scale: @@ -433,7 +458,7 @@ class FP16_Optimizer(object): will call ``model.load_state_dict()`` before ``fp16_optimizer_instance.load_state_dict()`` is called. Example:: - model = torch.nn.Linear(D_in, D_out).cuda().half() + model = torch.nn.Linear(D_in, D_out).to(get_accelerator().device_name()).half() optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) ... 
@@ -472,6 +497,14 @@ class FP16_Optimizer(object): def __repr__(self): return repr(self.optimizer) - @property - def loss_scale(self): - return self.cur_scale + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + if self.custom_loss_scaler: + return self.external_loss_scale + else: + return self.cur_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, _set_loss_scale) diff --git a/deepspeed/runtime/fp16/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py old mode 100644 new mode 100755 index 954d0ea6158507cbc2464b744b2c99903cff2f8b..58ab2ae96fe1398887f39b81b599b0418f9a5769 --- a/deepspeed/runtime/fp16/loss_scaler.py +++ b/deepspeed/runtime/fp16/loss_scaler.py @@ -37,6 +37,7 @@ class LossScalerBase: """ def __init__(self, cur_scale): self.cur_scale = cur_scale + self.dynamic = False @property def loss_scale(self): @@ -107,7 +108,8 @@ class DynamicLossScaler(LossScalerBase): scale_window=1000, min_scale=1, delayed_shift=1, - consecutive_hysteresis=False): + consecutive_hysteresis=False, + raise_error_at_min_scale=True): super(DynamicLossScaler, self).__init__(init_scale) self.cur_iter = 0 self.last_overflow_iter = -1 @@ -117,6 +119,8 @@ class DynamicLossScaler(LossScalerBase): self.delayed_shift = delayed_shift self.cur_hysteresis = delayed_shift self.consecutive_hysteresis = consecutive_hysteresis + self.raise_error_at_min_scale = raise_error_at_min_scale + self.dynamic = True # `params` is a list / generator of torch.Variable def has_overflow_serial(self, params): @@ -152,6 +156,10 @@ class DynamicLossScaler(LossScalerBase): if overflow: # self.cur_scale /= self.scale_factor if self.delayed_shift == 1 or self.cur_hysteresis == 1: + if (self.cur_scale == self.min_scale) and self.raise_error_at_min_scale: + raise Exception( + "Current loss scale already at minimum - cannot decrease scale anymore. Exiting run." 
+ ) self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale) else: self.cur_hysteresis -= 1 @@ -166,6 +174,18 @@ class DynamicLossScaler(LossScalerBase): self.cur_iter += 1 +# Although loss scaling is only defined for fp16, yet for backwards compatibility +# we still create a scaler for other dtypes (fp32, bf16) which does not perform any scaling. +def CreateLossScaler(dtype, static_loss_scale, dynamic_scaling, dynamic_loss_args): + if dtype == torch.half and dynamic_scaling: + if dynamic_loss_args is None: + return DynamicLossScaler() + return DynamicLossScaler(**dynamic_loss_args) + + loss_scale_value = static_loss_scale if dtype == torch.half else 1.0 + return LossScaler(scale=loss_scale_value) + + ############################################################## # Example usage below here -- assuming it's in a separate file ############################################################## diff --git a/deepspeed/runtime/fp16/onebit/__init__.py b/deepspeed/runtime/fp16/onebit/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..289769b423ced34cb6753df6bfc1dc6485e3c01d 100644 --- a/deepspeed/runtime/fp16/onebit/__init__.py +++ b/deepspeed/runtime/fp16/onebit/__init__.py @@ -0,0 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from .adam import OnebitAdam +from .lamb import OnebitLamb +from .zoadam import ZeroOneAdam diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 5ce451d7676ffbe0d86410b4105620cf3fa3ffd2..5eb22fb64d73c3a716ac427f470e18e593b6f79f 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -3,12 +3,9 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' import types import torch -import importlib import numpy as np -import time -import torch.distributed as dist - -from deepspeed.utils.logging import logger +from deepspeed.accelerator import get_accelerator +from deepspeed import comm as dist class OnebitAdam(torch.optim.Optimizer): @@ -37,7 
+34,7 @@ class OnebitAdam(torch.optim.Optimizer): cuda_aware (boolean, required): Set True if the underlying MPI implementation supports CUDA-Aware communication. (default: False) comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') - .. _Adam\: A Method for Stochastic Optimization: + .. _Adam\\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ @@ -178,14 +175,14 @@ class OnebitAdam(torch.optim.Optimizer): (self.size * self.divider))) state['server_chunk_size'] = state[ 'corrected_tensor_size'] // self.size - torch.cuda.empty_cache() + get_accelerator().empty_cache() state['worker_error'] = torch.zeros(state['corrected_tensor_size'], device=p.device) state['server_error'] = torch.zeros(state['server_chunk_size'], device=p.device) - torch.cuda.empty_cache() + get_accelerator().empty_cache() self.adam_freeze_key = True - if not self.initialize and torch.distributed.get_rank() == 0: + if not self.initialize and dist.get_rank() == 0: print("Cupy Buffers Initialized Successfully.") exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] @@ -249,9 +246,7 @@ class OnebitAdam(torch.optim.Optimizer): if not self.initialize: self.adam_freeze_key = False self.initialize = True - print( - f"Finished the initialization step at rank {torch.distributed.get_rank()}" - ) + print(f"Finished the initialization step at rank {dist.get_rank()}") return loss if self.adam_freeze_key is False: @@ -282,7 +277,7 @@ class OnebitAdam(torch.optim.Optimizer): state_dict['param_groups'][i].pop('exp_avg_mask') super().load_state_dict(state_dict) if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: print("Checkpoint loaded and OnebitAdam warmup stage starts/continues.") if self.adam_freeze_key is True: self.adam_freeze_key = False @@ -291,7 +286,7 @@ class 
OnebitAdam(torch.optim.Optimizer): else: self.deepspeed.enable_backward_allreduce = True else: - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: print( "Checkpoint loaded and OnebitAdam compression stage starts/continues." ) diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py index 01c6cd878488c73f0a08b6030622982e5fc45284..87c24695e23daffef851fbf7c30d435b167e083f 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -4,8 +4,9 @@ Copyright 2021 The Microsoft DeepSpeed Team import types import torch import numpy as np -import torch.distributed as dist +from deepspeed import comm as dist from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from deepspeed.accelerator import get_accelerator class OnebitLamb(torch.optim.Optimizer): @@ -46,9 +47,9 @@ class OnebitLamb(torch.optim.Optimizer): coefficient during compression stage (default: 0.5) factor_threshold (float, optional): threshold of how much the scaling factor can fluctuate between steps (default: 0.1) - .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes: + .. _Large Batch Optimization for Deep Learning\\: Training BERT in 76 minutes: https://arxiv.org/abs/1904.00962 - .. _Adam\: A Method for Stochastic Optimization: + .. _Adam\\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. 
_On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ @@ -283,7 +284,7 @@ class OnebitLamb(torch.optim.Optimizer): p.data = q.data if self.initialize and len(self.worker_errors) == 0: - torch.cuda.empty_cache() + get_accelerator().empty_cache() for i in range(len(self.exp_avg_flat)): self.worker_errors.append( torch.zeros(self.corrected_tensor_sizes[i], @@ -291,21 +292,21 @@ class OnebitLamb(torch.optim.Optimizer): self.server_errors.append( torch.zeros(self.server_chunk_sizes[i], device=self.exp_avg_flat[i].device)) - torch.cuda.empty_cache() + get_accelerator().empty_cache() if self.lamb_freeze_key: if self.size > 1: for i in range(len(self.exp_avg_flat)): if not self.initialize: - torch.cuda.empty_cache() + get_accelerator().empty_cache() self.worker_errors.append( torch.zeros(self.corrected_tensor_sizes[i], device=self.exp_avg_flat[i].device)) self.server_errors.append( torch.zeros(self.server_chunk_sizes[i], device=self.exp_avg_flat[i].device)) - torch.cuda.empty_cache() - if torch.distributed.get_rank() == 0: + get_accelerator().empty_cache() + if dist.get_rank() == 0: print("Cupy Buffers Initialized Successfully.") self.comm_backend_handle.compressed_allreduce( @@ -314,7 +315,7 @@ class OnebitLamb(torch.optim.Optimizer): self.server_errors[0], self.deepspeed.local_rank) - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: print('Pop out errors', flush=True) del self.worker_errors[:] del self.server_errors[:] @@ -389,9 +390,7 @@ class OnebitLamb(torch.optim.Optimizer): if not self.initialize: self.lamb_freeze_key = False self.initialize = True - print( - f"Finished the initialization step at rank {torch.distributed.get_rank()}" - ) + print(f"Finished the initialization step at rank {dist.get_rank()}") return loss if self.lamb_freeze_key is False: @@ -427,7 +426,7 @@ class OnebitLamb(torch.optim.Optimizer): del self.corrected_tensor_sizes[:] del self.server_chunk_sizes[:] if 
self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: print("Checkpoint loaded and OnebitLamb warmup stage starts/continues.") if self.lamb_freeze_key is True: self.lamb_freeze_key = False @@ -442,7 +441,7 @@ class OnebitLamb(torch.optim.Optimizer): if 'scaling_coeff' in self.state[p]: self.state[p].pop('scaling_coeff') else: - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: print( "Checkpoint loaded and OnebitLamb compression stage starts/continues." ) diff --git a/deepspeed/runtime/fp16/onebit/zoadam.py b/deepspeed/runtime/fp16/onebit/zoadam.py index b0238b1997f59d1874eabafff5661a71e74d1757..f86ae86f36cb5d7863c440a51ddafeb80cbafdd7 100644 --- a/deepspeed/runtime/fp16/onebit/zoadam.py +++ b/deepspeed/runtime/fp16/onebit/zoadam.py @@ -3,12 +3,9 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' import types import torch -import importlib import numpy as np -import time -import torch.distributed as dist - -from deepspeed.utils.logging import logger +from deepspeed.accelerator import get_accelerator +from deepspeed import comm as dist class ZeroOneAdam(torch.optim.Optimizer): @@ -47,7 +44,7 @@ class ZeroOneAdam(torch.optim.Optimizer): cuda_aware (boolean, required): Set True if the underlying MPI implementation supports CUDA-Aware communication. (default: False) comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') - .. _Adam\: A Method for Stochastic Optimization: + .. _Adam\\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. 
_On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ @@ -189,16 +186,16 @@ class ZeroOneAdam(torch.optim.Optimizer): (self.size * self.divider))) state['server_chunk_size'] = state[ 'corrected_tensor_size'] // self.size - torch.cuda.empty_cache() + get_accelerator().empty_cache() state['worker_error'] = torch.zeros(state['corrected_tensor_size'], device=p.device) state['server_error'] = torch.zeros(state['server_chunk_size'], device=p.device) # Accumulation of momentum, i.e., the u variable in the 0/1 Adam paper state['momentum_accumulator'] = torch.zeros_like(p.data) - torch.cuda.empty_cache() + get_accelerator().empty_cache() # self.freeze_key = True - if not self.initialize and torch.distributed.get_rank() == 0: + if not self.initialize and dist.get_rank() == 0: print("Cupy Buffers Initialized Successfully.") exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] @@ -313,9 +310,7 @@ class ZeroOneAdam(torch.optim.Optimizer): if not self.initialize: self.initialize = True - print( - f"Finished the initialization step at rank {torch.distributed.get_rank()}" - ) + print(f"Finished the initialization step at rank {dist.get_rank()}") return loss if self.state[self.param_groups[0]['params'][0]]['step'] > self.var_freeze_step: diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py old mode 100644 new mode 100755 index 56b880feb0add1a818a0ad93e1f3458a8117ace5..e0249f15a6fb884fc59acaffa678e8f2ec96d134 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -9,13 +9,16 @@ from deepspeed.moe.utils import split_params_grads_into_shared_and_expert_params import torch from torch._utils import _flatten_dense_tensors +from deepspeed.runtime import DeepSpeedOptimizer from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from 
deepspeed.utils import logger from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT +from deepspeed.accelerator import get_accelerator +from deepspeed import comm as dist -class FP16_UnfusedOptimizer(object): +class FP16_UnfusedOptimizer(DeepSpeedOptimizer): """ FP16 Optimizer without weight fusion to support LAMB optimizer @@ -35,11 +38,11 @@ class FP16_UnfusedOptimizer(object): self.fused_lamb_legacy = fused_lamb_legacy self._global_grad_norm = 0. - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: logger.info(f'Fused Lamb Legacy : {self.fused_lamb_legacy} ') - if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") + if not get_accelerator().is_available(): + raise SystemError("Cannot use fp16 without accelerator.") self.optimizer = init_optimizer # param groups @@ -84,6 +87,9 @@ class FP16_UnfusedOptimizer(object): self.cur_iter = 0 self.cur_scale = static_loss_scale + self.custom_loss_scaler = False + self.external_loss_scale = None + self.verbose = verbose self.clip_grad = clip_grad @@ -105,7 +111,7 @@ class FP16_UnfusedOptimizer(object): self.initialize_optimizer_states() - def zero_grad(self, set_grads_to_None=True): + def zero_grad(self, set_to_none=False): """ Zero FP16 parameter grads. 
""" @@ -113,7 +119,7 @@ class FP16_UnfusedOptimizer(object): # For speed, set model fp16 grad to None by default for group in self.fp16_groups: for p in group: - if set_grads_to_None: + if set_to_none: p.grad = None else: if p.grad is not None: @@ -182,6 +188,23 @@ class FP16_UnfusedOptimizer(object): return self.overflow + def set_lr(self, lr): + """Set the learning rate.""" + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + def get_lr(self): + """Return the current learning rate.""" + return self.optimizer.param_groups[0]["lr"] + + def override_loss_scale(self, loss_scale): + if loss_scale != self.external_loss_scale: + logger.info( + f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' + ) + self.custom_loss_scaler = True + self.external_loss_scale = loss_scale + def step(self, closure=None): """ Not supporting closure. @@ -260,9 +283,12 @@ class FP16_UnfusedOptimizer(object): 2. scaled_loss = fp32_loss*loss_scale 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves """ - scaled_loss = (loss.float()) * self.cur_scale - - scaled_loss.backward(create_graph=create_graph, retain_graph=retain_graph) + if self.custom_loss_scaler: + scaled_loss = self.external_loss_scale * loss + scaled_loss.backward() + else: + scaled_loss = (loss.float()) * self.cur_scale + scaled_loss.backward(create_graph=create_graph, retain_graph=retain_graph) def _update_scale(self, skip): if self.dynamic_loss_scale: @@ -313,6 +339,18 @@ class FP16_UnfusedOptimizer(object): param_groups = property(_get_param_groups, _set_param_groups) + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + if self.custom_loss_scaler: + return self.external_loss_scale + else: + return self.cur_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, 
_set_loss_scale) + def state_dict(self): """ Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. @@ -350,7 +388,7 @@ class FP16_UnfusedOptimizer(object): will call ``model.load_state_dict()`` before ``fp16_optimizer_instance.load_state_dict()`` is called. Example:: - model = torch.nn.Linear(D_in, D_out).cuda().half() + model = torch.nn.Linear(D_in, D_out).to(get_accelerator().device_name()).half() optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) ... @@ -395,13 +433,13 @@ class FP16_UnfusedOptimizer(object): for param in group: param.grad = torch.zeros(param.size(), dtype=param.dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) for i, group in enumerate(self.fp32_groups): for param in group: param.grad = torch.zeros(param.size(), dtype=param.dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) self.optimizer.step() diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py old mode 100644 new mode 100755 index b4dc749ae193c8874dc19c24019f75ae0a4b9b4b..faf5e6fee910514eefe33a5668ee0f2aa65b28e2 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -10,9 +10,7 @@ https://github.com/pytorch/pytorch/blob/v1.1.0/torch/optim/lr_scheduler.py import argparse from torch.optim import Optimizer -from typing import Union, List import math -from deepspeed.runtime.constants import * from deepspeed.utils import logger LR_SCHEDULE = 'lr_schedule' diff --git a/deepspeed/runtime/pipe/__init__.py b/deepspeed/runtime/pipe/__init__.py index 55e4c2ca3277d5191cd50c0079c0ac02c19fa205..dcffaa6101b44ece10ad6e2d5a4941b95eb7ae5d 100644 --- a/deepspeed/runtime/pipe/__init__.py +++ b/deepspeed/runtime/pipe/__init__.py @@ -1,2 +1,4 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .module import PipelineModule, LayerSpec, TiedLayerSpec from .topology 
import ProcessTopology diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 5807fb983ed7b6cf3dbb3cb1305c888f25b154c2..a080559b1a2aa065bc5916e32659b4e4a8242929 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -1,28 +1,19 @@ # Copyright 2019 The Microsoft DeepSpeed Team -import time -import logging -import copy -import os - from types import MethodType -from numpy import prod - import torch -import torch.nn as nn -import torch.optim as optim -import torch.distributed as dist +from deepspeed import comm as dist -from deepspeed.utils.logging import logger -from deepspeed.utils.timer import SynchronizedWallClockTimer, ThroughputTimer +from deepspeed.utils import logger +from deepspeed.utils.timer import ThroughputTimer +from deepspeed.accelerator import get_accelerator -from deepspeed.inference.engine import InferenceEngine from ..engine import DeepSpeedEngine, MEMORY_OPT_ALLREDUCE_SIZE -from ..utils import PartitionedTensor, ensure_directory_exists +from ..utils import PartitionedTensor from ..dataloader import RepeatingLoader -from .module import PipelineModule, PipelineError, TiedLayerSpec +from .module import PipelineModule, PipelineError from . import p2p from . import schedule @@ -80,8 +71,10 @@ class PipelineEngine(DeepSpeedEngine): # used to disable the pipeline all-reduce when used with 1-bit Adam/1-bit LAMB self.pipeline_enable_backward_allreduce = True - assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ - " with pipeline parallelism." + if self.elasticity_enabled(): + if not self.is_elastic_model_parallel_supported(): + assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ + " with pipeline parallelism." 
# pipeline step for logging self.log_batch_step_id = -1 @@ -112,9 +105,7 @@ class PipelineEngine(DeepSpeedEngine): self._force_grad_boundary = False - self.batch_timer = ThroughputTimer(batch_size=self.micro_batch_size * - self.micro_batches, - num_workers=self.dp_world_size, + self.batch_timer = ThroughputTimer(batch_size=self.train_batch_size(), logging_fn=self.tput_log, monitor_memory=False, steps_per_output=self.steps_per_print()) @@ -190,6 +181,8 @@ class PipelineEngine(DeepSpeedEngine): self.module.activation_checkpoint_interval = self._config.pipeline[ 'activation_checkpoint_interval'] + self.module.checkpoint_parallel_write_pipeline = self._config.checkpoint_parallel_write_pipeline + if self.is_last_stage(): self.loss_model = self.module.loss_fn @@ -328,13 +321,13 @@ class PipelineEngine(DeepSpeedEngine): f'train_batch() requires gradients enabled. Use eval_batch() instead.') # Curriculum learning could change activation shape - if self.curriculum_enabled(): - new_difficulty = self.curriculum_scheduler.update_difficulty( \ + if self.curriculum_enabled_legacy(): + new_difficulty = self.curriculum_scheduler_legacy.update_difficulty( \ self.global_steps + 1) - if self.global_steps == 0 or self.curriculum_scheduler.first_step: + if self.global_steps == 0 or self.curriculum_scheduler_legacy.first_step: self.reset_activation_shape() - self.curriculum_scheduler.first_step = False - elif new_difficulty != self.curriculum_scheduler.get_difficulty( \ + self.curriculum_scheduler_legacy.first_step = False + elif new_difficulty != self.curriculum_scheduler_legacy.get_difficulty( \ self.global_steps): self.reset_activation_shape() @@ -357,7 +350,7 @@ class PipelineEngine(DeepSpeedEngine): if self.global_steps % self.steps_per_print() == 0: if self.global_rank == 0: - elapsed = self.timers('train_batch').elapsed(reset=True) + elapsed = self.timers('train_batch').elapsed(reset=True) / 1000.0 iter_time = elapsed / self.steps_per_print() tput = self.train_batch_size() / 
iter_time print(f'steps: {self.global_steps} ' @@ -365,16 +358,12 @@ class PipelineEngine(DeepSpeedEngine): f'iter time (s): {iter_time:0.3f} ' f'samples/sec: {tput:0.3f}') - # Tensorboard - if self.tensorboard_enabled(): - if self.global_rank == 0: - self.summary_events = [(f'Train/Samples/train_loss', - self.agg_train_loss.mean().item(), - self.global_samples)] - for event in self.summary_events: # write_summary_events - self.summary_writer.add_scalar(event[0], event[1], event[2]) - if self.global_steps % self.steps_per_print() == 0: - self.summary_writer.flush() + # Monitoring + if self.global_rank == 0 and self.monitor.enabled: + self.summary_events = [(f'Train/Samples/train_loss', + self.agg_train_loss.mean().item(), + self.global_samples)] + self.monitor.write_events(self.summary_events) if self.wall_clock_breakdown( ) and self.global_steps % self.steps_per_print() == 0: @@ -423,13 +412,13 @@ class PipelineEngine(DeepSpeedEngine): self.module.eval() # Curriculum learning could change activation shape - if self.curriculum_enabled(): - new_difficulty = self.curriculum_scheduler.update_difficulty( \ + if self.curriculum_enabled_legacy(): + new_difficulty = self.curriculum_scheduler_legacy.update_difficulty( \ self.global_steps + 1) - if self.global_steps == 0 or self.curriculum_scheduler.first_step: + if self.global_steps == 0 or self.curriculum_scheduler_legacy.first_step: self.reset_activation_shape() - self.curriculum_scheduler.first_step = False - elif new_difficulty != self.curriculum_scheduler.get_difficulty( \ + self.curriculum_scheduler_legacy.first_step = False + elif new_difficulty != self.curriculum_scheduler_legacy.get_difficulty( \ self.global_steps): self.reset_activation_shape() @@ -445,6 +434,10 @@ class PipelineEngine(DeepSpeedEngine): sched = schedule.InferenceSchedule(micro_batches=self.micro_batches, stages=self.num_stages, stage_id=self.stage_id) + + # prevent dead-lock with multiple evals sequence + dist.barrier() + with torch.no_grad(): 
self._exec_schedule(sched) @@ -454,14 +447,11 @@ class PipelineEngine(DeepSpeedEngine): if compute_loss: eval_output = self._bcast_pipe_scalar(eval_output) - if self.tensorboard_enabled(): - if self.global_rank == 0: - self.summary_events = [(f'Train/Samples/eval_loss', - eval_output.mean().item(), - self.global_samples)] - for event in self.summary_events: # write_summary_events - self.summary_writer.add_scalar(event[0], event[1], event[2]) - self.summary_writer.flush() + if self.global_rank == 0 and self.monitor.enabled: + self.summary_events = [(f'Train/Samples/eval_loss', + eval_output.mean().item(), + self.global_samples)] + self.monitor.write_events(self.summary_events) # Restore the training iterator self.set_dataiterator(train_iterator) @@ -590,6 +580,11 @@ class PipelineEngine(DeepSpeedEngine): self.data_iterator = iterator def set_batch_fn(self, fn): + """Execute a post-processing function on input data. + + Args: + fn (function): The function to run. + """ self.batch_fn = fn def is_gradient_accumulation_boundary(self): @@ -690,9 +685,9 @@ class PipelineEngine(DeepSpeedEngine): # Optionally compute loss on the last device if self.is_last_stage(): - if self._compute_loss and self.loss_model is not None: + if self._compute_loss and self.module.loss_fn is not None: labels = self.pipe_buffers['labels'][buffer_id] - self.loss = self.loss_model(outputs, labels) + self.loss = self.module.loss_fn(outputs, labels) else: # Some models just return loss from forward() self.loss = outputs @@ -1167,17 +1162,15 @@ class PipelineEngine(DeepSpeedEngine): self.mem_status('AFTER STEP') - if self.tensorboard_enabled(): - if self.global_rank == 0: - self.summary_events = [(f'Train/Samples/lr', - self.get_lr()[0], - self.global_samples)] - if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'): - self.summary_events.append((f'Train/Samples/loss_scale', - self.optimizer.cur_scale, - self.global_samples)) - for event in self.summary_events: # write_summary_events - 
self.summary_writer.add_scalar(event[0], event[1], event[2]) + if self.global_rank == 0 and self.monitor.enabled: + self.summary_events = [(f'Train/Samples/lr', + self.get_lr()[0], + self.global_samples)] + if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'): + self.summary_events.append((f'Train/Samples/loss_scale', + self.optimizer.cur_scale, + self.global_samples)) + self.monitor.write_events(self.summary_events) if self.wall_clock_breakdown(): self.timers('step_microstep').stop() @@ -1278,14 +1271,14 @@ class PipelineEngine(DeepSpeedEngine): if print_rank != -1 and rank != print_rank: return - torch.cuda.synchronize() + get_accelerator().synchronize() if reset_max: - torch.cuda.reset_max_memory_cached() - torch.cuda.reset_max_memory_allocated() + get_accelerator().reset_max_memory_cached() + get_accelerator().reset_max_memory_allocated() - new_alloced = torch.cuda.memory_allocated() - new_cached = torch.cuda.memory_cached() + new_alloced = get_accelerator().memory_allocated() + new_cached = get_accelerator().memory_cached() delta_alloced = new_alloced - mem_alloced delta_cached = new_cached - mem_cached @@ -1293,8 +1286,8 @@ class PipelineEngine(DeepSpeedEngine): mem_cached = new_cached mem_alloced = new_alloced - max_alloced = torch.cuda.max_memory_allocated() - max_cached = torch.cuda.max_memory_cached() + max_alloced = get_accelerator().max_memory_allocated() + max_cached = get_accelerator().max_memory_cached() # convert to GB for printing new_alloced /= 1024**3 @@ -1325,10 +1318,11 @@ class PipelineEngine(DeepSpeedEngine): assert self._curr_ckpt_path is not None, \ "PipelineEngine expects module_state_dict() to be called from save_checkpoint()" - self.module.save_state_dict(self._curr_ckpt_path) + self.module.save_state_dict(self._curr_ckpt_path, + checkpoint_engine=self.checkpoint_engine) return None - def load_module_state_dict(self, state_dict, strict=True): + def load_module_state_dict(self, state_dict, strict=True, custom_load_fn=None): 
"""Override hack to instead use a directory path. This is important because pipeline models checkpoint by layer instead of rank. @@ -1339,12 +1333,14 @@ class PipelineEngine(DeepSpeedEngine): state_dict (str, None): unused strict (bool, optional): Strict state loading. Defaults to True. """ - + assert custom_load_fn is None, "custom_load_fn not supported w. pipeline parallelism" if (state_dict is not None) and (not isinstance(state_dict, str)): super().load_module_state_dict(state_dict, strict) return - self.module.load_state_dir(load_dir=self._curr_ckpt_path, strict=strict) + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + strict=strict, + checkpoint_engine=self.checkpoint_engine) # A map of PipeInstruction types to methods. Each method will be executed with the # kwargs provided to the PipeInstruction from the scheduler. @@ -1378,11 +1374,3 @@ class PipelineEngine(DeepSpeedEngine): # Equivalent to: self._exec_forward_pass(buffer_id=0) self._exec_instr = MethodType(self._INSTRUCTION_MAP[type(cmd)], self) self._exec_instr(**cmd.kwargs) - - def set_batch_fn(self, fn): - """Execute a post-processing function on input data. - - Args: - fn (function): The function to run. - """ - self.batch_fn = fn diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index 4121a0f8fd3f8d5017d33ca46f2b731fba248343..acf066bb710c68eb157cb197ff99b7ecc845ef33 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -1,21 +1,22 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import os import glob -import enum import re as regex -from collections import defaultdict from functools import partial import torch import torch.nn as nn -import torch.distributed as dist +from deepspeed import comm as dist from deepspeed.utils import logger from .. 
import utils as ds_utils from ..activation_checkpointing import checkpointing from .topology import PipeDataParallelTopology, PipelineParallelGrid from deepspeed.runtime.state_dict_factory import SDLoaderFactory +from deepspeed.accelerator import get_accelerator class PipelineError(Exception): @@ -85,6 +86,40 @@ class TiedLayerSpec(LayerSpec): class PipelineModule(nn.Module): + """Modules to be parallelized with pipeline parallelism. + + The key constraint that enables pipeline parallelism is the + representation of the forward pass as a sequence of layers + and the enforcement of a simple interface between them. The + forward pass is implicitly defined by the module ``layers``. The key + assumption is that the output of each layer can be directly fed as + input to the next, like a ``torch.nn.Sequence``. The forward pass is + implicitly: + + .. code-block:: python + + def forward(self, inputs): + x = inputs + for layer in self.layers: + x = layer(x) + return x + + .. note:: + Pipeline parallelism is not compatible with ZeRO-2 and ZeRO-3. + + Args: + layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module. + num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided. + topology (``deepspeed.runtime.pipe.ProcessTopology``, optional): Defines the axes of parallelism axes for training. Must be provided if ``num_stages`` is ``None``. + loss_fn (callable, optional): Loss is computed ``loss = loss_fn(outputs, label)`` + seed_layers(bool, optional): Use a different seed for each layer. Defaults to False. + seed_fn(type, optional): The custom seed generating function. Defaults to random seed generator. + base_seed (int, optional): The starting seed. Defaults to 1234. + partition_method (str, optional): The method upon which the layers are partitioned. Defaults to 'parameters'. 
+ activation_checkpoint_interval (int, optional): The granularity activation checkpointing in terms of number of layers. 0 disables activation checkpointing. + activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``. + checkpointable_layers(list, optional): Checkpointable layers may not be checkpointed. Defaults to None which does not additional filtering. + """ def __init__(self, layers, num_stages=None, @@ -97,37 +132,6 @@ class PipelineModule(nn.Module): activation_checkpoint_interval=0, activation_checkpoint_func=checkpointing.checkpoint, checkpointable_layers=None): - """Modules to be parallelized with pipeline parallelism. - - The key constraint that enables pipeline parallelism is the - representation of the forward pass as a sequence of layers - and the enforcement of a simple interface between them. The - forward pass is implicitly defined by the module ``layers``. The key - assumption is that the output of each layer can be directly fed as - input to the next, like a ``torch.nn.Sequence``. The forward pass is - implicitly: - - .. code-block:: python - - def forward(self, inputs): - x = inputs - for layer in self.layers: - x = layer(x) - return x - - .. note:: - Pipeline parallelism is not compatible with ZeRO-2 and ZeRO-3. - - Args: - layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module. - num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided. - topology (``deepseed.runtime.pipe.ProcessTopology``, optional): Defines the axes of parallelism axes for training. Must be provided if ``num_stages`` is ``None``. - loss_fn (callable, optional): Loss is computed ``loss = loss_fn(outputs, label)`` - base_seed (int, optional): [description]. Defaults to 1234. - partition_method (str, optional): [description]. Defaults to 'parameters'. 
- activation_checkpoint_interval (int, optional): The granularity activation checkpointing in terms of number of layers. 0 disables activation checkpointing. - activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``. - """ super().__init__() @@ -194,12 +198,12 @@ class PipelineModule(nn.Module): self.tied_weight_attrs = {} # Offset the random seed by the stage ID. - #newseed = torch.cuda.initial_seed() + self._grid.get_stage_id() + #newseed = get_accelerator().initial_seed() + self._grid.get_stage_id() #ds_utils.set_random_seed(newseed) - #with torch.random.fork_rng(devices=[torch.cuda.current_device()]): + #with torch.random.fork_rng(devices=[get_accelerator().current_device_name()]): self._build() - self.to(f'cuda:{self.local_rank}') + self.to(get_accelerator().device_name(self.local_rank)) self.tied_comms = self._index_tied_modules() self._synchronize_tied_weights() @@ -563,14 +567,29 @@ class PipelineModule(nn.Module): ckpt_files.sort() return ckpt_files - def save_state_dict(self, save_dir): - if self._grid.data_parallel_id != 0: - return + def save_state_dict(self, save_dir, checkpoint_engine): + # Processes having the same model parallel rank on different data parallel instances + # have identical layer weights. We can distribute the task of saving the layer weights + # among the data parallel ranks. For example, if a pipeline stage has 9 layers and + # if there are 2 data parallel instances, rank 0 will save the first 5 layers and + # rank 1 will save the last 4. 
+ dp_rank = self._grid.data_parallel_id + dp_size = self._grid.data_parallel_size + num_layers = len(self.forward_funcs) + if self.checkpoint_parallel_write_pipeline: + # spread layers evenly across data parallel ranks + offsets = ds_utils.partition_uniform(num_layers, dp_size) + start, end = offsets[dp_rank], offsets[dp_rank + 1] + else: + # data parallel rank 0 writes all layers + if dp_rank != 0: + return + start, end = 0, num_layers + layer_list = self.forward_funcs[start:end] os.makedirs(save_dir, exist_ok=True) - layer_offset = self._local_start - for idx, layer in enumerate(self.forward_funcs): - model_ckpt_path = self.ckpt_layer_path(save_dir, idx) + for idx, layer in enumerate(layer_list): + model_ckpt_path = self.ckpt_layer_path(save_dir, start + idx) if not hasattr(layer, 'state_dict'): continue # We pass cloned tensors to torch.save() to avoid checkpoint bloat which occurs because torch.save() @@ -584,9 +603,9 @@ class PipelineModule(nn.Module): {k: v.clone() for k, v in orig_state_dict.items()}) - torch.save(final_state_dict, model_ckpt_path) + checkpoint_engine.save(final_state_dict, model_ckpt_path) - def load_state_dir(self, load_dir, strict=True): + def load_state_dir(self, load_dir, checkpoint_engine, strict=True): for idx, layer in enumerate(self.forward_funcs): # Functions, etc. 
will not have state_dicts if not hasattr(layer, 'load_state_dict'): @@ -597,7 +616,10 @@ class PipelineModule(nn.Module): mp_rank = self._grid.get_slice_parallel_rank() mp_world_size = self._grid.get_slice_parallel_world_size() - sd_loader = SDLoaderFactory.get_sd_loader(model_ckpt_list, version=2.0) + sd_loader = SDLoaderFactory.get_sd_loader( + model_ckpt_list, + version=2.0, + checkpoint_engine=checkpoint_engine) load_path, checkpoint, _ = sd_loader.load(mp_world_size, mp_rank, module_key=None, is_pipe_parallel=True) layer.load_state_dict(checkpoint) diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 8a1b7192600680a349f4c8697f70b5af73b49a1f..93c61114c558548b6bad259e7ce07f13f952b796 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -6,11 +6,12 @@ import pickle import typing import torch -import torch.distributed as dist +from deepspeed import comm as dist # To query whether we have send/recv support from packaging.version import Version from deepspeed.git_version_info import torch_info +from deepspeed.accelerator import get_accelerator _groups = None _grid = None @@ -25,7 +26,7 @@ def can_send_recv() -> bool: #initializes adjacent process groups -#run this only after torch.distributed.init_process_group() has been called +#run this only after deepspeed.init_distributed() has been called def init_process_groups(grid): global _groups, _grid _grid = grid @@ -92,7 +93,7 @@ def wait(): op.wait() _async = [] - torch.cuda.synchronize() + get_accelerator().synchronize() def send_obj(msg: typing.Any, dest: int): @@ -110,10 +111,12 @@ def send_obj(msg: typing.Any, dest: int): # serialize the message msg = pickle.dumps(msg) # construct a tensor to send - msg = torch.ByteTensor(torch.ByteStorage.from_buffer(msg)).cuda() + msg = torch.ByteTensor(torch.ByteStorage.from_buffer(msg)).to( + get_accelerator().device_name()) # Send meta and message - length_tensor = torch.tensor([len(msg)], dtype=torch.long).cuda() + 
length_tensor = torch.tensor([len(msg)], + dtype=torch.long).to(get_accelerator().device_name()) dist.send(length_tensor, dst=dest) dist.send(msg, dst=dest) @@ -128,11 +131,12 @@ def recv_obj(sender: int) -> typing.Any: sender (int): The rank sending the message. """ # Get message meta - length = torch.tensor([0], dtype=torch.long).cuda() + length = torch.tensor([0], dtype=torch.long).to(get_accelerator().device_name()) dist.recv(length, src=sender) # Receive and deserialize - msg = torch.empty(length.item(), dtype=torch.uint8).cuda() + msg = torch.empty(length.item(), + dtype=torch.uint8).to(get_accelerator().device_name()) dist.recv(msg, src=sender) msg = pickle.loads(msg.cpu().numpy().tobytes()) @@ -140,7 +144,7 @@ def recv_obj(sender: int) -> typing.Any: def _to(x): """Recursively move to the current device.""" if torch.is_tensor(x): - return x.cuda() + return x.to(get_accelerator().device_name()) if isinstance(x, (tuple, list)): ret = [_to(x_) for x_ in x] if isinstance(x, tuple): diff --git a/deepspeed/runtime/pipe/schedule.py b/deepspeed/runtime/pipe/schedule.py index 181d43e8062a5901b81661a1cd07b3a74803f1e3..19e73da17fb96dd01f5ae25fa62b906d2a805ea2 100644 --- a/deepspeed/runtime/pipe/schedule.py +++ b/deepspeed/runtime/pipe/schedule.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from ..utils import call_to_str from abc import ABC, abstractmethod diff --git a/deepspeed/runtime/pipe/topology.py b/deepspeed/runtime/pipe/topology.py index 240c973a3fc1efb19622364ce5edfc3bf7bc926f..6c0cd96440fbf9a785c556b1ebcb6cc4fa97073c 100644 --- a/deepspeed/runtime/pipe/topology.py +++ b/deepspeed/runtime/pipe/topology.py @@ -1,9 +1,6 @@ # Copyright 2019 The Microsoft DeepSpeed Team -from deepspeed.utils import logger - -import torch.distributed as dist -import sys +from deepspeed import comm as dist from collections import namedtuple from itertools import product as cartesian_product @@ -58,7 +55,7 @@ class ProcessTopology: raise ValueError('get_rank() 
does not support slices. Use filter_match())') key = self.ProcessCoord(**coord_kwargs) - assert key in self.mapping, f'key {kwargs} invalid' + assert key in self.mapping, f'key {coord_kwargs} invalid' return self.mapping[key] def get_axis_names(self): diff --git a/deepspeed/runtime/progressive_layer_drop.py b/deepspeed/runtime/progressive_layer_drop.py old mode 100644 new mode 100755 index 41c08cfd9e7cd69abb7dba73f3233c181f895d3a..65e1a563096d955161fbde1c8e1b344030354d42 --- a/deepspeed/runtime/progressive_layer_drop.py +++ b/deepspeed/runtime/progressive_layer_drop.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import numpy as np from deepspeed.utils import log_dist diff --git a/deepspeed/runtime/quantize.py b/deepspeed/runtime/quantize.py old mode 100644 new mode 100755 index 05fc50201b77abf587bc20f71f94e9768fbc2481..81a7bd56991beed3937f487a54465800ca26b6e2 --- a/deepspeed/runtime/quantize.py +++ b/deepspeed/runtime/quantize.py @@ -1,20 +1,15 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch import math -from deepspeed.utils import log_dist from deepspeed.utils import logger from deepspeed.ops.quantizer import ds_quantizer -# number of 2-dimensional parameters in a layer -# this is set for transformer-based models TWO_D_PARAMS = 6 class Quantizer(object): def __init__(self, - q_target_bits=8, - q_start_bits=16, - q_period=100, - q_offset=100, q_groups=1, q_mixed_fp16=False, q_change_ratio=0.01, @@ -25,17 +20,11 @@ class Quantizer(object): use_quantizer_kernel=False, layer_num=0): - self.q_target_bits = q_target_bits - - self.q_start_bits = [q_start_bits] * (layer_num if layer_num != 0 else 1) - self.q_period = [q_period] * (layer_num if layer_num != 0 else 1) - self.q_offset = q_offset self.q_groups = q_groups self.q_mixed_fp16 = q_mixed_fp16 self.q_change_ratio = q_change_ratio self.q_type = q_type self.qsteps = 0 - self.q_init_period = q_period self.quantize_real_ratio = 1.000 self.q_verbose = q_verbose self.q_eigenvalue = 
q_eigenvalue @@ -44,6 +33,7 @@ class Quantizer(object): self.layer_num = layer_num def any_precision_switch(self): + # Temporary disabled functionality if self.layer_num == 0: return True result = False @@ -70,54 +60,69 @@ class Quantizer(object): for i in range(len(parameter_group)): for p in parameter_group[i]: - if len(p.size()) > 1: + if len(p.size()) > 1 and hasattr(p, "start_bits") and p.start_bits: param_id = id(p) - eigenvalue, layer_id = block_eigenvalue[param_id] if param_id in block_eigenvalue else (None, 0) + if block_eigenvalue is None: + eigenvalue, layer_id = None, 0 + else: + eigenvalue, layer_id = block_eigenvalue[param_id] if param_id in block_eigenvalue else (None, 0) if eigenvalue is not None: factor = 1 + math.floor(eigenvalue * 4) p.data = self.compute_quantization(p.data, layer_id, factor) else: - p.data = self.compute_quantization(p.data, layer_id) + p.data = self.compute_quantization(p, layer_id) def step(self): - self.qsteps += (TWO_D_PARAMS * (self.layer_num if self.layer_num != 0 else 1)) + self.qsteps += 1 + + def quantize_highbit(self, inputs, num_bits): + + q_range = 2**num_bits + input_flat = inputs.reshape(self.q_groups, -1) + g_min = input_flat.amin(dim=-1, keepdim=True) + g_max = input_flat.amax(dim=-1, keepdim=True) - def sr_quantize(self, input_flat, input_g, scale): # Random number generator (Uniform) - p = torch.cuda.FloatTensor(input_flat.size(), - device=input_flat.device).uniform_() - p = torch.split(p, p.size(0) // self.q_groups) - add_s = torch.zeros_like(input_flat) - add_s = torch.split(add_s, add_s.size(0) // self.q_groups) - - scale = [q_range / (2 * max(g.max(), g.min().abs())) for g in input_g] - # Quantize with INT rounding - input_flat = [(g * s).int().float() / s for (g, s) in zip(input_g, scale)] - # Compute the error - error = [((g - q).abs() / s) for (g, s, q) in zip(input_g, scale, input_flat)] - # Stochastic Rounding - add_s = [ - a_s.masked_fill_(pg < err_g, - 1 / s) for (a_s, - pg, - err_g, - s) in 
zip(add_s, - p, - error, - scale) - ] - add_s = [ - a_s * (g > 0).float() - a_s * (g < 0).float() for a_s, - g in zip(add_s, - input_flat) - ] - input_flat = [((q + a_s) * s).clamp(-(q_range >> 1), - (q_range >> 1) - 1) / s for q, - a_s, - s in zip(input_flat, - add_s, - scale)] - return input_flat + if self.q_rounding == 'nearest': + p = 0. + else: + p = input_flat.new(input_flat.shape).uniform_(-0.5, 0.5) + + if self.q_type == 'symmetric': + scale = 2 * torch.max(torch.abs(g_min), torch.abs(g_max)) / q_range + zero_point = 0. + input_flat = (input_flat / scale + p).round().clamp( + -(q_range >> 1), + (q_range >> 1) - 1) * scale + elif self.q_type == 'asymmetric': + scale = (g_max - g_min) / q_range + zero_point = (g_min / scale).round() * scale + input_flat = ((input_flat - zero_point) / scale + p).round().clamp( + 0, + (q_range - 1)) * scale + zero_point + output = input_flat.reshape(inputs.shape).contiguous() + return output + + def quantize_tenary(self, inputs): + input_flat = inputs.reshape(self.q_groups, -1) + n = input_flat.shape[1] + m = input_flat.norm(p=1, dim=1).div(n) + thres = (0.7 * m).view(-1, 1) #.expand_as(input_flat) + pos = (input_flat > thres).type(inputs.type()) + neg = (input_flat < -thres).type(inputs.type()) + mask = (input_flat.abs() > thres).type(inputs.type()) + alpha = ((mask * input_flat).abs().sum(dim=1) / mask.sum(dim=1)).view(-1, 1) + output = alpha * pos - alpha * neg + output = output.reshape(inputs.shape).contiguous() + return output + + def quantize_binary(self, inputs): + input_flat = inputs.reshape(self.q_groups, -1) + n = input_flat.shape[1] + m = input_flat.norm(p=1, dim=1, keepdim=True).div(n) + output = input_flat.sign().mul(m) + output = output.reshape(inputs.shape).contiguous() + return output def mixed_fp16_quantize(self, input, input_q, index): if self.q_mixed_fp16 and self.q_start_bits[index] >= (self.q_target_bits - 1): @@ -131,90 +136,49 @@ class Quantizer(object): # when reducing 1 bit at each period, we increase 
the period # to go slowly toward the target quantization bits # the period and starting bit can be configured - if self.q_offset > 0: - if self.qsteps >= self.q_offset: - self.q_offset = 0 - self.qsteps = 0 - else: - return input - if self.q_start_bits[index] != self.q_target_bits: - if self.qsteps >= self.q_period[index]: + if input.start_bits != input.target_bits: + if self.qsteps >= input.q_period: self.quantize_real_ratio = 1.0 - if self.q_eigenvalue: - self.q_period[index] <<= 1 - self.q_period[index] *= factor - self.q_start_bits[index] -= 1 - else: - for i in range(len(self.q_start_bits)): - self.q_start_bits[i] -= 1 - self.q_period[i] <<= 1 + input.q_period <<= 1 + input.q_period *= factor + input.start_bits -= 1 if self.q_verbose: logger.info( - f'Quantization settings: current bit-precision = {self.q_start_bits[index]}, step = {self.qsteps}, quantization period = {self.q_period[index]}, index = {index}' + f'Quantization settings: current bit-precision = {input.start_bits}, step = {self.qsteps}, quantization period = {input.q_period}, index = {index}' ) - assert (self.q_start_bits[index] >= self.q_target_bits), \ + assert (input.start_bits >= input.target_bits), \ 'Quantization bit is lower than target precision bits!' 
- # quantize the weights base on the selected bits and the value-range - if not self.use_quantizer_kernel: - q_range = 2**self.q_start_bits[index] - input_flat = input.view(-1) - input_g = torch.split(input_flat, input_flat.size(0) // self.q_groups) - if self.q_type == 0: #symmetric - if self.use_quantizer_kernel: - input_q = ds_quantizer(input.clone(), - self.q_groups, - self.q_start_bits[index]) - else: - scale = [q_range / (2 * max(g.max(), g.min().abs())) for g in input_g] - if self.q_rounding == 0: # Nearest value rounding - input_flat = [(g * s).round().clamp(-(q_range >> 1), - (q_range >> 1) - 1) / s for g, - s in zip(input_g, - scale)] - else: # Stochastic Rounding - if self.use_quantizer_kernel: - input_q = ds_quantizer(input.clone(), - self.q_groups, - self.q_start_bits[index], - sr=True) - else: - input_flat = self.sr_quantize(input_flat, input_g) - else: #asymmetric - if self.q_rounding == 0: - if self.use_quantizer_kernel: - input_q = ds_quantizer(input.clone(), - self.q_groups, - self.q_start_bits[index], - asym=True) - else: - scale = [(g.max() - g.min()) / q_range for g in input_g] - input_flat = [ - ((g - g.min()) / s).round().clamp(0, - (q_range - 1)) * s + g.min() - for g, - s in zip(input_g, - scale) - ] - else: - input_q = ds_quantizer(input.clone(), - self.q_groups, - self.q_start_bits[index], - asym=True) - - if self.use_quantizer_kernel or (self.q_type and self.q_rounding): - return self.mixed_fp16_quantize(input, input_q, index) + if self.use_quantizer_kernel: + if input.start_bits <= 2: + raise ValueError( + 'Quantization bit is too low, please do it without quantization kernel!' 
+ ) + input_q = ds_quantizer( + input.data.clone(), + self.q_groups, + input.start_bits, + asym=False if self.q_type == 'symmetric' else True, + sr=False if self.q_rounding == 'nearest_neighbor' else True) else: - if self.q_mixed_fp16 and self.q_start_bits[index] >= (self.q_target_bits - - 1): - input_flat = [(self.quantize_real_ratio * g) + - ((1 - self.quantize_real_ratio) * g_q) for g, - g_q in zip(input_g, - input_flat)] - input_q = torch.cat(input_flat) - input_q = input_q.reshape(input.size()) - return input_q + if input.start_bits >= 3: + input_flat = self.quantize_highbit(input.data, input.start_bits) + elif input.start_bits == 2: + assert self.q_type == 'symmetric', 'Quantization type is not symmetric!' + assert self.q_rounding == 'nearest', 'Quantization rounding is not nearest_neighbor!' + input_flat = self.quantize_tenary(input.data) + elif input.start_bits == 1: + assert self.q_type == 'symmetric', 'Quantization type is not symmetric!' + assert self.q_rounding == 'nearest', 'Quantization rounding is not nearest_neighbor!' 
+ input_flat = self.quantize_binary(input.data) + if self.use_quantizer_kernel: + return self.mixed_fp16_quantize(input.data, input_q, index) + else: + if self.q_mixed_fp16 and input.start_bits >= input.target_bits - 1: + input_flat = self.quantize_real_ratio * input.data + \ + (1 - self.quantize_real_ratio) * input_flat + return input_flat def update_fp16_ratio(self): if self.q_mixed_fp16: diff --git a/deepspeed/runtime/state_dict_factory.py b/deepspeed/runtime/state_dict_factory.py old mode 100644 new mode 100755 index 09887aaa275ce4060489aefb46be7ec4182dee09..1f5c97f27bd379eeb970a74ea457c67fbdaf3f3c --- a/deepspeed/runtime/state_dict_factory.py +++ b/deepspeed/runtime/state_dict_factory.py @@ -8,7 +8,10 @@ import copy import collections import json from abc import ABC, abstractmethod + from deepspeed.utils import logger +from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine + from .weight_quantizer import WeightQuantization AUTO_MODULE_KEY = 'auto' @@ -16,28 +19,41 @@ AUTO_MODULE_KEY = 'auto' class SDLoaderFactory: @staticmethod - def get_sd_loader_json(json_file): - with open(json_file) as f: - data = json.load(f) - sd_type = data['type'] - ckpt_list = data['checkpoints'] - version = data['version'] - return SDLoaderFactory.get_sd_loader(ckpt_list, sd_type, version) + def get_sd_loader_json(json_file, checkpoint_engine): + if isinstance(json_file, str): + with open(json_file) as f: + data = json.load(f) + else: + assert isinstance(json_file, dict) + data = json_file + sd_type = data['type'] + ckpt_list = data['checkpoints'] + version = data['version'] + ckpt_type = data.get('parallelization', 'pp') + mp_size = data.get('mp_size', 0) + if sd_type.lower() in ['bloom', 'ds_model']: + return data + return SDLoaderFactory.get_sd_loader(ckpt_list, + checkpoint_engine, + sd_type, + version) @staticmethod - def get_sd_loader(ckpt_list, sd_type='Megatron', version=None): + def get_sd_loader(ckpt_list, checkpoint_engine, 
sd_type='Megatron', version=None): if sd_type == 'Megatron': - return MegatronSDLoader(ckpt_list, version) + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) else: assert False, '{} checkpoint type is not supported'.format(sd_type) class SDLoaderBase(ABC): - def __init__(self, ckpt_list, version): + def __init__(self, ckpt_list, version, checkpoint_engine): self.module_key = None self.ckpt_list = ckpt_list - self.check_ckpt_list() self.version = version + self.checkpoint_engine = TorchCheckpointEngine( + ) if checkpoint_engine is None else checkpoint_engine + self.check_ckpt_list() def load(self, mp_world_size, @@ -79,7 +95,8 @@ class SDLoaderBase(ABC): if num_ckpt == mp_world_size: assert os.path.exists(load_path) #logger.info(f'rank: {mp_rank} loading checkpoint: {load_path}') - sd = torch.load(load_path, map_location=lambda storage, loc: storage) + sd = self.checkpoint_engine.load(load_path, map_location=lambda storage, \ + loc: storage) if quantize: quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, @@ -108,9 +125,9 @@ class SDLoaderBase(ABC): logger.info(f"mp_rank: {mp_rank}, ckpt_list: {ckpt_list}") sd_list = [ - torch.load(ckpt, - map_location=lambda storage, - loc: storage) for ckpt in ckpt_list + self.checkpoint_engine.load(ckpt, + map_location=lambda storage, + loc: storage) for ckpt in ckpt_list ] return sd_list @@ -126,9 +143,9 @@ class SDLoaderBase(ABC): f"mp_rank: {mp_rank}, ckpt_list: {self.ckpt_list[ckpt_index]}, offset: {ckpt_offset}" ) - sd = torch.load(self.ckpt_list[ckpt_index], - map_location=lambda storage, - loc: storage) + sd = self.checkpoint_engine.load(self.ckpt_list[ckpt_index], + map_location=lambda storage, + loc: storage) return sd, num_to_split, ckpt_offset @@ -161,7 +178,9 @@ class SDLoaderBase(ABC): #logger.info(f'checkpoint file list: {self.ckpt_list}') assert len(self.ckpt_list) > 0 - sd = torch.load(self.ckpt_list[0], map_location=lambda storage, loc: storage) + sd = 
self.checkpoint_engine.load(self.ckpt_list[0], + map_location=lambda storage, + loc: storage) # check checkpoint count is same with saved mp_world_size if 'mp_world_size' in sd.keys(): @@ -193,8 +212,8 @@ class SDLoaderBase(ABC): class MegatronSDLoader(SDLoaderBase): - def __init__(self, ckpt_list, version): - super().__init__(ckpt_list, version) + def __init__(self, ckpt_list, version, checkpoint_engine): + super().__init__(ckpt_list, version, checkpoint_engine) """ ## Q/K/V data need special processing key: transformer.layers.0.attention.query_key_value.weight, shape: torch.Size([3192, 4256]) @@ -431,7 +450,9 @@ class MegatronSDLoader(SDLoaderBase): "mlp.dense_h_to_4h.bias" ] - sd = torch.load(ckpt_file_name, map_location=lambda storage, loc: storage) + sd = self.checkpoint_engine.load(ckpt_file_name, + map_location=lambda storage, + loc: storage) # partial_key is a sub-string of one key in the sd def check_key_exist(partial_key, sd): diff --git a/deepspeed/runtime/swap_tensor/async_swapper.py b/deepspeed/runtime/swap_tensor/async_swapper.py index e6e19a4c67efb913862f1646d43768bf0bb17169..45614abd794ed01a35fd9ee9e2b1d16dfedd39bc 100644 --- a/deepspeed/runtime/swap_tensor/async_swapper.py +++ b/deepspeed/runtime/swap_tensor/async_swapper.py @@ -6,6 +6,7 @@ Functionality of swapping tensors to/from (NVMe) storage devices. 
""" import torch +from deepspeed import comm as dist from deepspeed.utils.logging import logger from deepspeed.runtime.swap_tensor.utils import swap_out_tensors, SwapBuffer @@ -66,10 +67,10 @@ class AsyncTensorSwapper(object): self._swap_out_tensor(tensor, swap_path) def _report_statistics(self, message): - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: element_size = torch.tensor([], dtype=self.dtype).element_size() swapped_GB = (self.num_elements_swapped * element_size) / (1024**3) - logger.info( + logger.debug( f'{message} num_elems = {self.num_elements_swapped}, {swapped_GB:5.2f} GB' ) diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py index a08af96f2a1254d5bdcafce07a6a5fabecf0c3ce..70b806c3a15f4545780388a91d4fa91b6f524546 100644 --- a/deepspeed/runtime/swap_tensor/optimizer_utils.py +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -8,11 +8,11 @@ Functionality of swapping tensors to/from (NVMe) storage devices. 
import os import torch +from deepspeed import comm as dist from deepspeed.utils.logging import logger -from deepspeed.runtime.zero.offload_constants import * from deepspeed.runtime.swap_tensor.constants import * from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, \ - MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers, get_sized_buffer + MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers from deepspeed.runtime.swap_tensor.utils import SwapBufferManager, SwapBufferPool @@ -133,7 +133,7 @@ class OptimizerSwapper(object): self.swap_element_size = torch.tensor([], dtype=dtype).element_size() self.swap_folder = os.path.join(base_folder, 'optimizer', - f'rank{torch.distributed.get_rank()}') + f'rank{dist.get_rank()}') os.makedirs(self.swap_folder, exist_ok=True) self.optimizer = optimizer @@ -146,10 +146,9 @@ class OptimizerSwapper(object): # Swap buffer management self.largest_numel = self._io_aligned_numel(largest_numel) self.dtype = dtype - self.swap_buffer_manager = SwapBufferManager( - num_elems=self.largest_numel, - count=swap_config[OFFLOAD_OPTIMIZER_BUFFER_COUNT], - dtype=dtype) + self.swap_buffer_manager = SwapBufferManager(num_elems=self.largest_numel, + count=swap_config.buffer_count, + dtype=dtype) # Timers self.timers = timers @@ -271,7 +270,7 @@ class OptimizerSwapper(object): fp16_partitions_info=fp16_partitions_info[curr_index:], fp16_swap_buffers=fp16_swap_buffers) - if torch.distributed.get_rank() == 0 and SWAPPER_DEBUG_MODE: + if dist.get_rank() == 0 and SWAPPER_DEBUG_MODE: for i, tensor in enumerate(fp16_pinned_tensors): true_index = curr_index + i logger.info( @@ -376,7 +375,7 @@ class OptimizerSwapper(object): dest_paths=swap_paths, pinned_buffers=pinned_buffers) - if torch.distributed.get_rank() == 0 and SWAPPER_DEBUG_MODE: + if dist.get_rank() == 0 and SWAPPER_DEBUG_MODE: for i, tensor in enumerate(src_tensors): logger.info( f'copy_in_fp16_param: fp32_id = {id(parameters[i])} index = {i}, swap_num_elem = 
{src_tensors[i].numel()}' diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py index cbcb136aa337d3f7cdc97520090cde95cabc3038..515853529e0c797ac5603f95d1924526ee394753 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py @@ -5,15 +5,15 @@ Licensed under the MIT license. Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ -import os import torch from deepspeed.utils.logging import logger -from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.ops.op_builder import AsyncIOBuilder +from deepspeed import comm as dist from deepspeed.runtime.swap_tensor.constants import * from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object, \ - MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers, get_sized_buffer + get_sized_buffers from deepspeed.runtime.swap_tensor.async_swapper import AsyncTensorSwapper from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper @@ -62,7 +62,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): 'print_exclude_list' ] - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: print_object(obj=self, name='PartitionedOptimizerSwapper', exclude_list=self.print_exclude_list) @@ -160,7 +160,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): self._log_timers([WRITE_TIMER]) - if DEBUG_MODE and torch.distributed.get_rank() == 0: + if DEBUG_MODE and dist.get_rank() == 0: logger.info(f'optimizer_param_swap_out: {(swap_bytes/(1024**3)):5.2f} GB') def swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors): @@ -200,7 +200,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): t.data = buffer.data self._log_timers([READ_TIMER, WAIT_TIMER]) - if DEBUG_MODE and torch.distributed.get_rank() == 0: + if DEBUG_MODE and dist.get_rank() == 0: 
logger.info(f'optimizer_param_swap_in: {(swap_bytes/(1024**3)):5.2f} GB') def _separate_pinned_tensors(self, swap_info): diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index 5128bbf6a59a68ad8afb43e5a9b9c1f57c28a390..e9e134e6779aea1511439df38010c8ec9f927d19 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -9,17 +9,15 @@ import os import shutil from enum import Enum import torch -import torch.distributed as dist - -from deepspeed.utils.logging import logger -from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import AsyncIOBuilder from .constants import * from .utils import swap_in_tensors, swap_out_tensors, MIN_AIO_BYTES, AIO_ALIGNED_BYTES, print_object, SwapBufferPool -from ..zero.offload_constants import * def print_rank_0(message, debug=False, force=False): - if torch.distributed.get_rank() == 0 and (debug or force): + if dist.get_rank() == 0 and (debug or force): print(message) @@ -86,7 +84,7 @@ class AsyncPartitionedParameterSwapper(object): def _configure_aio(self, ds_config): self.swap_config = ds_config.zero_config.offload_param torch_dtype_string = str(self.dtype).split(".")[1] - self.swap_folder = os.path.join(self.swap_config[OFFLOAD_PARAM_NVME_PATH], + self.swap_folder = os.path.join(self.swap_config.nvme_path, 'zero_stage_3', f'{torch_dtype_string}params', f'rank{dist.get_rank()}') @@ -102,18 +100,17 @@ class AsyncPartitionedParameterSwapper(object): self.aligned_bytes = AIO_ALIGNED_BYTES * self.aio_config[AIO_THREAD_COUNT] self.numel_alignment = self.aligned_bytes // self.swap_element_size - self.elements_per_buffer = self.swap_config[OFFLOAD_PARAM_BUFFER_SIZE] + self.elements_per_buffer = self.swap_config.buffer_size self.aligned_elements_per_buffer = 
self._io_aligned_numel( self.elements_per_buffer) - self.param_buffer_count = self.swap_config[OFFLOAD_PARAM_BUFFER_COUNT] + self.param_buffer_count = self.swap_config.buffer_count self.available_buffer_ids = [i for i in range(self.param_buffer_count)] self.reserved_buffer_ids = [] - self.buffers = torch.empty(int(self.aligned_elements_per_buffer * - self.param_buffer_count), - dtype=self.dtype, - pin_memory=True, - requires_grad=False) + self.buffers = get_accelerator().pin_memory( + torch.empty(int(self.aligned_elements_per_buffer * self.param_buffer_count), + dtype=self.dtype, + requires_grad=False)) self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], @@ -305,7 +302,7 @@ class AsyncPartitionedParameterSwapper(object): f'Num inflight: params {len(self.inflight_params)}, buffers {len(self.inflight_swap_in_buffers)}, numel = {self.inflight_numel}', force=True) print_rank_0( - f'Num available: param {len(self.available_params)}, numel = {self.available_numel}', + f'Num available params: count = {len(self.available_params)}, ids = {self.available_params}, numel = {self.available_numel}', force=True) assert len(swap_in_paths) <= len(self.available_buffer_ids), f"Not enough buffers {len(self.available_buffer_ids)} for swapping {len(swap_in_paths)}" @@ -395,9 +392,10 @@ class AsyncPartitionedParameterSwapper(object): def reserve_partitioned_swap_space(self, partition_num_elems): aligned_numel = sum( [self._io_aligned_numel(numel) for numel in partition_num_elems]) - self.partitioned_swap_buffer = torch.zeros(aligned_numel, - device='cpu', - dtype=self.dtype).pin_memory() + self.partitioned_swap_buffer = get_accelerator().pin_memory( + torch.zeros(aligned_numel, + device='cpu', + dtype=self.dtype)) self.partitioned_swap_pool = SwapBufferPool([self.partitioned_swap_buffer]) def swap_out_partitioned_params(self, dst_fp16_params, src_fp32_params): diff --git a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py 
b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py index 7d0116faab5b51724fd24de4420802c83ca1cfce..4e101528cd231224d5431cc3e43be4900d293770 100644 --- a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py @@ -5,18 +5,13 @@ Licensed under the MIT license. Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ -import os -import torch +from deepspeed.ops.op_builder import AsyncIOBuilder +from deepspeed import comm as dist -from deepspeed.utils.logging import logger -from deepspeed.ops.aio import AsyncIOBuilder - -from deepspeed.runtime.zero.offload_constants import * from deepspeed.runtime.swap_tensor.constants import * -from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object, \ - MIN_AIO_BYTES, AIO_ALIGNED_BYTES +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object from deepspeed.runtime.swap_tensor.async_swapper import AsyncTensorSwapper -from deepspeed.runtime.swap_tensor.optimizer_utils import SwapBufferManager, get_sized_buffer +from deepspeed.runtime.swap_tensor.utils import get_sized_buffer from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper @@ -95,8 +90,8 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): numel_alignment=self.numel_alignment, timers=self.timers) - self.async_swap_in = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_READ] - self.async_swap_out = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_WRITE] + self.async_swap_in = swap_config.pipeline_read + self.async_swap_out = swap_config.pipeline_write self.swap_ops = { SYNC_SWAP_IN: None, @@ -113,7 +108,7 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): 'print_exclude_list' ] - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: print_object(obj=self, name='PipelinedOptimizerSwapper', exclude_list=self.print_exclude_list) @@ -254,7 +249,7 @@ class 
PipelinedOptimizerSwapper(OptimizerSwapper): count=required_buffer_count, dtype=parameter.dtype) assert allocated_buffers is not None, \ - f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing {OFFLOAD_OPTIMIZER_BUFFER_COUNT}" + f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing 'buffer_count'" state_buffers = allocated_buffers[:len(param_info.tensors)] param_info.set_swap_buffers(state_buffers) diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py index 2a751e398000914c819dfd32cd988666e0918030..1d236bd7fcacbb3f6fc7d381475f42e2051ed058 100644 --- a/deepspeed/runtime/swap_tensor/utils.py +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -5,12 +5,11 @@ Licensed under the MIT license. Functionality of swapping tensors to/from (NVMe) storage devices. """ -import os import torch from deepspeed.utils.logging import logger +from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.swap_tensor.constants import AIO_BLOCK_SIZE, AIO_QUEUE_DEPTH, \ - AIO_THREAD_COUNT, AIO_SINGLE_SUBMIT, AIO_OVERLAP_EVENTS +from deepspeed import comm as dist MIN_AIO_BYTES = 1024**2 AIO_ALIGNED_BYTES = 1024 @@ -181,16 +180,17 @@ class SwapBufferManager(object): self.count = count self.dtype = dtype self.all_buffers = [ - torch.zeros(num_elems, - device='cpu', - dtype=dtype).pin_memory() for _ in range(count) + get_accelerator().pin_memory( + torch.zeros(num_elems, + device='cpu', + dtype=dtype)) for _ in range(count) ] self.free_buffer_index = [i for i in range(count)] self.used_buffer_index = {} self.gigabytes = (self.all_buffers[0].element_size() * num_elems * count) / (1024 **3) - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: exclude_list = ['all_buffers'] print_object(obj=self, name='SwapBufferManager', exclude_list=exclude_list) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py old mode 100644 new mode 100755 index 
ff6daba1c6c9b60ebe486ce7e7bf27391c6ff74c..30dad84b16d1eb2b789fad5f539d908397e6e1c1 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -7,31 +7,29 @@ Helper functions and classes from multiple sources. ''' from collections.abc import Iterable -from deepspeed.moe.utils import is_moe_param, split_params_into_shared_and_expert_params +from deepspeed.moe.utils import is_moe_param import os import psutil import gc -from math import ceil, sqrt +from math import sqrt from math import floor -from bisect import bisect_left, bisect_right +from bisect import bisect_left import torch -from torch._six import inf -import torch.distributed as dist +from deepspeed import comm as dist + +try: + from torch._six import inf as inf +except ModuleNotFoundError: + from torch import inf as inf from deepspeed.utils import groups, logger from deepspeed.runtime.constants import PIPE_REPLICATED from numpy import prod +from deepspeed.accelerator import get_accelerator -# pt-1.9 deprecations -if hasattr(torch.cuda, "memory_reserved"): - torch_memory_reserved = torch.cuda.memory_reserved -else: - torch_memory_reserved = torch.cuda.memory_allocated -if hasattr(torch.cuda, "max_memory_reserved"): - torch_max_memory_reserved = torch.cuda.max_memory_reserved -else: - torch_max_memory_reserved = torch.cuda.memory_cached +torch_memory_reserved = get_accelerator().memory_reserved +torch_max_memory_reserved = get_accelerator().max_memory_reserved class DummyOptim(): @@ -191,7 +189,7 @@ class CheckOverflow(object): def check_using_norm(self, norm_group, reduce_overflow=True): # TODO: I don't think reduce_overflow is needed if mpu is None overflow = -1 in norm_group - overflow_gpu = torch.cuda.FloatTensor([overflow]) + overflow_gpu = get_accelerator().FloatTensor([overflow]) if self.has_moe_params: # In this case, we need to do an all_reduce across # the expert_parallel_group, so that if there was @@ -202,11 +200,11 @@ class CheckOverflow(object): op=dist.ReduceOp.MAX, 
group=groups._get_max_expert_parallel_group()) if self.mpu is not None: - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.mpu.get_model_parallel_group()) + dist.all_reduce(overflow_gpu, + op=dist.ReduceOp.MAX, + group=self.mpu.get_model_parallel_group()) elif reduce_overflow: - dist.all_reduce(overflow_gpu, op=torch.distributed.ReduceOp.MAX) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX) dist.barrier() overflow = overflow_gpu[0].item() return bool(overflow) @@ -242,9 +240,9 @@ class CheckOverflow(object): overflow = self.has_overflow_serial(params) # Since each model parallel GPU carries only part of the model, # make sure overflow flag is synced across all the model parallel GPUs - overflow_gpu = torch.cuda.ByteTensor([overflow]) - # torch.distributed.all_reduce(overflow_gpu, - # op=torch.distributed.ReduceOp.MAX, + overflow_gpu = get_accelerator().ByteTensor([overflow]) + # deepspeeed.comm.all_reduce(overflow_gpu, + # op=deepspeed.comm.ReduceOp.MAX, # group=mpu.get_model_parallel_group()) if has_moe_params: # All reduce this across expert_parallel_group, so that if an expert @@ -253,9 +251,9 @@ class CheckOverflow(object): op=dist.ReduceOp.MAX, group=groups._get_max_expert_parallel_group()) if self.zero_reduce_scatter: - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=torch.distributed.group.WORLD) + dist.all_reduce(overflow_gpu, + op=dist.ReduceOp.MAX, + group=dist.get_world_group()) elif self.mpu is not None: if self.deepspeed is not None: using_pipeline = hasattr(self.deepspeed, @@ -264,17 +262,16 @@ class CheckOverflow(object): and self.deepspeed.pipeline_enable_backward_allreduce is False ) or (not using_pipeline and self.deepspeed.enable_backward_allreduce is False): - torch.distributed.all_reduce( - overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.mpu.get_data_parallel_group()) - torch.distributed.all_reduce(overflow_gpu, - 
op=torch.distributed.ReduceOp.MAX, - group=self.mpu.get_model_parallel_group()) + dist.all_reduce(overflow_gpu, + op=dist.ReduceOp.MAX, + group=self.mpu.get_data_parallel_group()) + dist.all_reduce(overflow_gpu, + op=dist.ReduceOp.MAX, + group=self.mpu.get_model_parallel_group()) elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False: - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=torch.distributed.group.WORLD) + dist.all_reduce(overflow_gpu, + op=dist.ReduceOp.MAX, + group=dist.get_world_group()) overflow = overflow_gpu[0].item() return bool(overflow) @@ -304,7 +301,7 @@ class CheckOverflow(object): def _handle_overflow(cpu_sum, x, i): import math - rank = torch.distributed.get_rank() + rank = dist.get_rank() if rank == 0: t_i = -1 for v_i, v in enumerate(x.data.contiguous().view(-1)): @@ -353,12 +350,12 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): norm_type = float(norm_type) if norm_type == inf: total_norm = max(p.grad.data.abs().max() for p in parameters) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all GPUs. if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = 0 @@ -373,18 +370,18 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): total_norm += param_norm.item()**norm_type # Sum across all model parallel GPUs. 
- total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. / norm_type) # Need to average total_norm across different GPUs due to the presence of moe params pg = groups._get_data_parallel_group() scaled_norm = total_norm * 1.0 / float(dist.get_world_size(group=pg)) - scaled_norm_tensor = torch.cuda.FloatTensor([float(scaled_norm)]) + scaled_norm_tensor = get_accelerator().FloatTensor([float(scaled_norm)]) dist.all_reduce(scaled_norm_tensor, group=pg) total_norm = scaled_norm_tensor.item() @@ -419,12 +416,12 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): norm_type = float(norm_type) if norm_type == inf: total_norm = max(p.grad.data.abs().max() for p in parameters) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all GPUs. if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = 0. @@ -443,11 +440,11 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): total_norm += param_norm.item()**norm_type # Sum across all model parallel GPUs. 
- total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. / norm_type) if total_norm == float( @@ -489,11 +486,11 @@ def get_grad_zeros(parameters, mpu=None): total_zeros += count_zeros.item() # Sum across all model parallel GPUs. - total_zeros_cuda = torch.cuda.FloatTensor([float(total_zeros)]) + total_zeros_cuda = get_accelerator().FloatTensor([float(total_zeros)]) if mpu is not None: - torch.distributed.all_reduce(total_zeros_cuda, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_zeros_cuda, + op=dist.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) total_zeros = total_zeros_cuda[0].item() return total_zeros @@ -522,12 +519,12 @@ def get_weight_norm(parameters, norm_type=2, mpu=None): norm_type = float(norm_type) if norm_type == inf: total_norm = max(p.data.abs().max() for p in parameters) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all GPUs. if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = 0. @@ -546,11 +543,11 @@ def get_weight_norm(parameters, norm_type=2, mpu=None): total_norm += param_norm**norm_type # Sum across all model parallel GPUs. 
- total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. / norm_type) if total_norm == float( @@ -670,7 +667,7 @@ class PartitionedTensor: self.local_data, self.partition = self._partition_tensor(tensor) @classmethod - def from_meta(cls, meta, local_part, group, device='cuda'): + def from_meta(cls, meta, local_part, group, device=get_accelerator().device_name()): assert meta.dtype == torch.long dummy = torch.ones(dist.get_world_size(group=group)) part_obj = cls(tensor=dummy, group=group) @@ -774,14 +771,14 @@ def memory_status(msg, print_rank=-1, reset_max=False): if print_rank != -1 and rank != print_rank: return - torch.cuda.synchronize() + get_accelerator().synchronize() if reset_max: - torch.cuda.reset_max_memory_cached() - torch.cuda.reset_max_memory_allocated() + get_accelerator().reset_max_memory_cached() + get_accelerator().reset_max_memory_allocated() - new_alloced = torch.cuda.memory_allocated() - new_cached = torch.cuda.memory_cached() + new_alloced = get_accelerator().memory_allocated() + new_cached = get_accelerator().memory_cached() delta_alloced = new_alloced - mem_alloced delta_cached = new_cached - mem_cached @@ -789,8 +786,8 @@ def memory_status(msg, print_rank=-1, reset_max=False): mem_cached = new_cached mem_alloced = new_alloced - max_alloced = torch.cuda.max_memory_allocated() - max_cached = torch.cuda.max_memory_cached() + max_alloced = get_accelerator().max_memory_allocated() + max_cached = get_accelerator().max_memory_cached() # convert to GB for printing new_alloced /= 1024**3 @@ -803,22 +800,26 @@ def memory_status(msg, print_rank=-1, reset_max=False): print( f'RANK={rank} 
MEMSTATS', msg, - f'device={torch.cuda.current_device()} ' + f'device={get_accelerator().current_device_name()} ' f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) ' f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)' ) def get_ma_status(): - if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0: + if dist.is_initialized() and not dist.get_rank() == 0: return 0 - return torch.cuda.memory_allocated() + return get_accelerator().memory_allocated() + + +def empty_cache(): + get_accelerator().empty_cache() def see_memory_usage(message, force=False): if not force: return - if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0: + if dist.is_initialized() and not dist.get_rank() == 0: return # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports @@ -827,8 +828,8 @@ def see_memory_usage(message, force=False): # Print message except when distributed but not rank 0 logger.info(message) logger.info( - f"MA {round(torch.cuda.memory_allocated() / (1024 * 1024 * 1024),2 )} GB \ - Max_MA {round(torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),2)} GB \ + f"MA {round(get_accelerator().memory_allocated() / (1024 * 1024 * 1024),2 )} GB \ + Max_MA {round(get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024),2)} GB \ CA {round(torch_memory_reserved() / (1024 * 1024 * 1024),2)} GB \ Max_CA {round(torch_max_memory_reserved() / (1024 * 1024 * 1024))} GB ") @@ -838,8 +839,7 @@ def see_memory_usage(message, force=False): f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%') # get the peak memory to report correct data, so reset the counter for the next call - if hasattr(torch.cuda, "reset_peak_memory_stats"): # pytorch 1.4+ - torch.cuda.reset_peak_memory_stats() + get_accelerator().reset_peak_memory_stats() def call_to_str(base, *args, **kwargs): @@ -913,20 +913,20 @@ def 
get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None): norm_type = float(norm_type) if norm_type == inf: total_norm = max(t.data.abs().max() for t in input_tensors) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = sum( [t.data.float().norm(norm_type).item()**norm_type for t in input_tensors]) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) if total_norm == float( diff --git a/deepspeed/runtime/weight_quantizer.py b/deepspeed/runtime/weight_quantizer.py index a6c241d19f58ba41596e579ce6cedaca92c6bb81..f88a9310ba32497867ab9d1e05e2c4485a01204f 100644 --- a/deepspeed/runtime/weight_quantizer.py +++ b/deepspeed/runtime/weight_quantizer.py @@ -1,5 +1,8 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch from ..module_inject.replace_policy import HFBertLayerPolicy, replace_policies +from deepspeed.accelerator import get_accelerator class WeightQuantization(object): @@ -44,9 +47,11 @@ class WeightQuantization(object): q_scale.append(data_scale) value_list[index] = data_int index += 1 - q_scale = (1 / torch.cat(q_scale, - dim=merge_dim).to( - torch.cuda.current_device()).view(-1).unsqueeze(0)) + q_scale = ( + 1 / + torch.cat(q_scale, + dim=merge_dim).to( + get_accelerator().current_device_name()).view(-1).unsqueeze(0)) if "mlp.dense_4h_to_h.weight" in key: self.mlp4hh_scales.append(q_scale) elif "mlp.dense_h_to_4h.weight" in key: @@ -63,7 +68,7 @@ class WeightQuantization(object): torch.cat((s, torch.zeros((1, max_dim - s.shape[-1]), - device=torch.cuda.current_device())), + device=get_accelerator().current_device_name())), dim=-1) if s.shape[-1] < max_dim else s for s in layer_scales ] return torch.cat(layer_scales).unsqueeze(0) @@ -127,16 +132,15 @@ class WeightQuantization(object): layer_scales = [] for key in range(len(keys)): - if self.mlp_extra_grouping and is_mlp(keys[key]): + if self.mlp_extra_grouping and self.is_mlp(keys[key]): data_quantized, data_scale = self.quantize_data(keys[key], quantize_bits, groups * 2) elif policy_cls is HFBertLayerPolicy and self.is_qkv(keys[key]): data_quantized, data_scale = self.quantize_data(keys[key], quantize_bits, groups * 3) else: data_quantized, data_scale = self.quantize_data(keys[key], quantize_bits, groups) keys[key].copy_(data_quantized) - layer_scales.append( - (1 / - data_scale.to(torch.cuda.current_device()).view(-1).unsqueeze(0))) 
+ layer_scales.append((1 / data_scale.to( + get_accelerator().current_device_name()).view(-1).unsqueeze(0))) all_scales.append(self.merge_layer_scales(layer_scales)) return layer diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 3804fb50a3715166440c7d9bbd4c73ba5d0d6a42..d7ab552188b80d924ae96e597c197c77b4cdb49e 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -1,197 +1,277 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ Copyright (c) Microsoft Corporation Licensed under the MIT license. """ -from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject +from pydantic import Field, validator +import sys +from typing import Optional +from enum import Enum +from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel from deepspeed.utils import logger -from .constants import * -from .offload_constants import * -from .offload_config import get_offload_param_config, get_default_offload_param_config, \ - get_offload_optimizer_config, get_default_offload_optimizer_config - - -class DeepSpeedZeroConfig(DeepSpeedConfigObject): - def __init__(self, param_dict): - super(DeepSpeedZeroConfig, self).__init__() - - self.stage = None - self.contiguous_gradients = None - self.reduce_scatter = None - self.reduce_bucket_size = None - self.allgather_partitions = None - self.allgather_bucket_size = None - self.overlap_comm = None - self.load_from_fp32_weights = None - - self.elastic_checkpoint = None - - #Offload Specific Parameters - self.offload_param = None - self.offload_optimizer = None - self.sub_group_size = None - - #Stage3 Specific Parameters - self.prefetch_bucket_size = None - self.param_persistence_threshold = None - self.max_live_parameters = None - self.max_reuse_distance = None - self.gather_16bit_weights_on_model_save = None - - self.ignore_unused_parameters = None - self.round_robin_gradients = None - - if ZERO_OPTIMIZATION in 
param_dict.keys(): - zero_config_dict = param_dict[ZERO_OPTIMIZATION] - if type(zero_config_dict) is bool: - zero_config_dict = self.read_zero_config_deprecated(param_dict) - else: - zero_config_dict = ZERO_OPTIMIZATION_DEFAULT - - self._initialize(zero_config_dict) - - def read_zero_config_deprecated(self, param_dict): +from .offload_config import DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig, OffloadDeviceEnum + +# ZeRO optimization. By default, this optimization is not enabled. +# Users have to configure the desired optimization (0 means disabled) in params.json as below example: +ZERO_FORMAT = """ +ZeRO optimization should be enabled as: +"session_params": { + "zero_optimization": { + "stage": [0|1|2], + "stage3_max_live_parameters" : 1000000000, + "stage3_max_reuse_distance" : 1000000000, + "allgather_partitions": [true|false], + "allgather_bucket_size": 500000000, + "reduce_scatter": [true|false], + "contiguous_gradients" : [true|false] + "overlap_comm": [true|false], + "reduce_bucket_size": 500000000, + "load_from_fp32_weights": [true|false], + "cpu_offload": [true|false] (deprecated), + "cpu_offload_params" : [true|false] (deprecated), + "cpu_offload_use_pin_memory": [true|false] (deprecated), + "sub_group_size" : 1000000000000, + "offload_param": {...}, + "offload_optimizer": {...}, + "ignore_unused_parameters": [true|false], + "round_robin_gradients": [true|false] + } +} +""" + +ZERO_OPTIMIZATION = "zero_optimization" + + +def read_zero_config_deprecated(param_dict): + zero_config_dict = {} + zero_config_dict["stage"] = 1 if param_dict[ZERO_OPTIMIZATION] else 0 + if zero_config_dict["stage"] > 0: + zero_config_dict["allgather_bucket_size"] = get_scalar_param( + param_dict, + "allgather_size", + 5e8) + logger.warning( + "DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. 
Please use the following format: {}" + .format(ZERO_FORMAT)) + return zero_config_dict + + +def get_zero_config(param_dict): + if ZERO_OPTIMIZATION in param_dict: + zero_config_dict = param_dict[ZERO_OPTIMIZATION] + if isinstance(zero_config_dict, bool): + zero_config_dict = read_zero_config_deprecated(param_dict) + else: zero_config_dict = {} - zero_config_dict[ - ZERO_OPTIMIZATION_STAGE] = 1 if param_dict[ZERO_OPTIMIZATION] else 0 - if zero_config_dict[ZERO_OPTIMIZATION_STAGE] > 0: - zero_config_dict[ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE] = get_scalar_param( - param_dict, - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED, - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT) - - logger.warning( - 'DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. Please use the following format: {}' - .format(ZERO_FORMAT)) - return zero_config_dict - - def _sanity_check(self, zero_config_dict): - deprecated_dict = dict( - ZERO_OPTIMIZATION_CPU_OFFLOAD=ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS=ZERO_OPTIMIZATION_OFFLOAD_PARAM, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY= - f'{ZERO_OPTIMIZATION_OFFLOAD_PARAM} or {ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER}' - ) - - for old_key, new_key in deprecated_dict.items(): - if old_key in zero_config_dict: - logger.warning( - f'DeepSpeedConfig: {old_key} is deprecated. 
Please use {new_key}.') - - def _initialize(self, zero_config_dict): - self._sanity_check(zero_config_dict) - - self.stage = get_scalar_param(zero_config_dict, - ZERO_OPTIMIZATION_STAGE, - ZERO_OPTIMIZATION_STAGE_DEFAULT) - - self.contiguous_gradients = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS, - ZERO3_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT - if self.stage == ZERO_OPTIMIZATION_WEIGHTS else - ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT) - - self.reduce_bucket_size = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE, - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT) - - self.reduce_scatter = get_scalar_param(zero_config_dict, - ZERO_OPTIMIZATION_REDUCE_SCATTER, - ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT) - - self.overlap_comm = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_OVERLAP_COMM, - ZERO3_OPTIMIZATION_OVERLAP_COMM_DEFAULT if self.stage - == ZERO_OPTIMIZATION_WEIGHTS else ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT) - - self.allgather_partitions = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS, - ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT) - - self.allgather_bucket_size = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE, - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT) - - self.load_from_fp32_weights = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS, - ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT) - - self.elastic_checkpoint = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT, - ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT) - - if ZERO_OPTIMIZATION_CPU_OFFLOAD in zero_config_dict: - cpu_offload_optimizer = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD, - ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT) - if cpu_offload_optimizer: - self.offload_optimizer = get_default_offload_optimizer_config() - else: - self.offload_optimizer = 
get_offload_optimizer_config(zero_config_dict) - - if ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS in zero_config_dict: - cpu_offload_params = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT) - if cpu_offload_params: - self.offload_param = get_default_offload_param_config() - else: - self.offload_param = get_offload_param_config(zero_config_dict) - - self.sub_group_size = get_scalar_param(zero_config_dict, - ZERO_OPTIMIZATION_SUB_GROUP_SIZE, - ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT) - - self.max_live_parameters = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS, - ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS_DEFAULT) - - self.max_reuse_distance = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_MAX_REUSE_DISTANCE, - ZERO_OPTIMIZATION_MAX_REUSE_DISTANCE_DEFAULT) - - self.prefetch_bucket_size = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE, - ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT) - - self.param_persistence_threshold = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD, - ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT) - - # config key has been renamed to use "16bit" instead of "fp16." 
falling back - # to old config name in order to preserve backwards compatibility - self.gather_16bit_weights_on_model_save = ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT - for key in [ - ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE, - ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE - ]: - if key in zero_config_dict: - self.gather_16bit_weights_on_model_save = zero_config_dict[key] - break - - self.ignore_unused_parameters = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS, - ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT) - - self.legacy_stage1 = get_scalar_param(zero_config_dict, - ZERO_OPTIMIZATION_LEGACY_STAGE1, - ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT) - - self.round_robin_gradients = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS, - ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS_DEFAULT) + return DeepSpeedZeroConfig(**zero_config_dict) + + +class ZeroStageEnum(int, Enum): + """ Enum class for possible zero stages """ + disabled = 0 + optimizer_states = 1 + gradients = 2 + weights = 3 + max_stage = 3 + + +class DeepSpeedZeroConfig(DeepSpeedConfigModel): + """ + Sets parameters for ZeRO optimizations. + """ + + stage: ZeroStageEnum = 0 + """ + Chooses different stages of ZeRO Optimizer. Stage 0, 1, 2, and 3 refer + to disabled, optimizer state partitioning, and optimizer+gradient state + partitioning, and optimizer+gradient+parameter partitioning, respectively. + """ + + contiguous_gradients: bool = True + """ + Copies the gradients to a contiguous buffer as they are produced. Avoids + memory fragmentation during backward pass. + """ + + reduce_scatter: bool = True + """ + Uses reduce or reduce scatter instead of allreduce to average gradients + """ + + reduce_bucket_size: int = Field(pp_int(5e8), ge=0) + """ + Number of elements reduced/allreduced at a time. 
Limits the memory required + for the allgather for large model sizes + """ + + allgather_partitions: bool = True + """ + Chooses between allgather collective or a series of broadcast collectives + to gather updated parameters from all the GPUs at the end of each step + """ + + allgather_bucket_size: int = Field(pp_int(5e8), ge=0) + """ + Number of elements allgathered at a time. Limits the memory required for + the allgather for large model sizes + """ + + overlap_comm: bool = None # None for dynamic default value (see validator `overlap_comm_valid` below) + """ + Attempts to overlap the reduction of the gradients with backward computation + """ + + load_from_fp32_weights: bool = True + """ + Boolean indicating whether to initialize fp32 master weights from fp32 + copies in checkpoint (no precision loss) or from model's fp16 copies (with + precision loss). This can be used to initialize optimizer state even when + checkpoint is missing optimizer state. + """ + + elastic_checkpoint: bool = False + """ + Enable loading checkpoint that was saved by job with different GPU count. + No longer supported. + """ + + offload_param: Optional[DeepSpeedZeroOffloadParamConfig] = None + """ + Enable offloading of model parameters to CPU or NVMe. This frees up GPU + memory for larger models or batch sizes. Valid only with stage 3. Expects a + dictionary containing values for :any:`DeepSpeedZeroOffloadParamConfig`. + """ + + offload_optimizer: Optional[DeepSpeedZeroOffloadOptimizerConfig] = None + """ + Enable offloading of optimizer state to CPU or NVMe, and optimizer + computation to CPU. This frees up GPU memory for larger models or batch + sizes. Valid for ZeRO stage 1, 2, 3. Expects a dictionary containing values + for :any:`DeepSpeedZeroOffloadOptimizerConfig`. + """ + + sub_group_size: int = Field(pp_int(1e9), ge=0) + """ + Tile size for parameter processing to fit massive models (with trillions of + parameters). 
Used by ZeRO3-Offload and ZeRO-Infinity + """ + + cpu_offload_param: bool = Field( + None, + deprecated=True, + new_param="offload_param", + new_param_fn=( + lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) + if val else None), + ) + """ Deprecated, please use ``offload_param`` """ + + cpu_offload_use_pin_memory: bool = Field( + None, + deprecated=True, + new_param="offload_param or offload_optimizer", + set_new_param=False, + ) + """ Deprecated, please use ``offload_param`` or ``offload_optimizer`` """ + + cpu_offload: bool = Field( + None, + deprecated=True, + new_param="offload_optimizer", + new_param_fn=( + lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) + if val else None), + ) + """ Deprecated, please use ``offload_optimizer`` """ + + prefetch_bucket_size: int = Field(pp_int(5e7), + ge=0, + alias="stage3_prefetch_bucket_size") + """ + Maximum number of parameter elements to fetch ahead of use. Used by ZeRO3, + ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference. + """ + + param_persistence_threshold: int = Field(pp_int(1e5), + ge=0, + alias="stage3_param_persistence_threshold") + """ + Do not partition parameters smaller than this threshold. Smaller values use + less memory, but can greatly increase communication (especially + latency-bound messages). + """ + + model_persistence_threshold: int = Field(pp_int(sys.maxsize, + "sys.maxsize"), + ge=0, + alias="stage3_model_persistence_threshold") + """ + Maximum number of parameter elements that can be persisted in GPU and not + partitioned. This imposes an upper bound on the number of unpartitioned + parameters resulting from param_persistence_threshold setting. Used by + ZeRO3-Offload, ZeRO-Infinity and ZeRO-Inference. + """ + + max_live_parameters: int = Field(pp_int(1e9), + ge=0, + alias="stage3_max_live_parameters") + """ + The maximum number of parameters resident per GPU before releasing. Smaller + values use less memory, but perform more communication. 
+ """ + + max_reuse_distance: int = Field(pp_int(1e9), ge=0, alias="stage3_max_reuse_distance") + """ + Do not release a parameter if it will be reused within this threshold of + parameters. Smaller values use less memory, but perform more communication. + """ + + gather_16bit_weights_on_model_save: bool = Field( + False, + alias="stage3_gather_16bit_weights_on_model_save") + """ + Consolidate the weights before saving the model by ``save_16bit_model()``. + Since the weights are partitioned across GPUs, they aren’t part of + ``state_dict``, so this function automatically gathers the weights when + this option is enabled and then saves the fp16 model weights. + """ + + stage3_gather_fp16_weights_on_model_save: bool = Field( + False, + deprecated=True, + new_param="gather_16bit_weights_on_model_save") + """ Deprecated, please use ``gather_16bit_weights_on_model_save`` """ + + ignore_unused_parameters: bool = True + """ + Unused parameters in modules may be unexpected in static networks, but + could be normal in dynamic networks. This controls whether or not training + should terminate with an error message when unused parameters are detected. + This is set to ``False`` by default, which means unused parameters are + ignored and training continues. Now is just used in stage 2. + """ + + legacy_stage1: bool = False + """ + For backward-compatibility enable old ZeRO stage 1 implementation. Use at + your own risk, will be deprecated soon. + """ + + round_robin_gradients: bool = False + """ + Stage 1 and 2 optimization for CPU offloading that parallelizes gradient + copying to CPU memory among ranks by fine-grained gradient partitioning. + Performance benefit grows with gradient accumulation steps (more copying + between optimizer steps) or GPU count (increased parallelism). 
+ """ + + # Validators + @validator("overlap_comm") + def overlap_comm_valid(cls, field_value, values): + if field_value is None: + assert ( + "stage" in values + ), "DeepSpeedZeroConfig: 'stage' must be defined before 'overlap_comm'" + field_value = values["stage"] == ZeroStageEnum.weights + return field_value diff --git a/deepspeed/runtime/zero/contiguous_memory_allocator.py b/deepspeed/runtime/zero/contiguous_memory_allocator.py index a5b14ae13342c64ec982f5b2c039a40befe6ac44..46e55787e6642d330019cb1ac52d26d912f0d148 100644 --- a/deepspeed/runtime/zero/contiguous_memory_allocator.py +++ b/deepspeed/runtime/zero/contiguous_memory_allocator.py @@ -1,8 +1,12 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch +from deepspeed import comm as dist + def print_rank_0(message): - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: print(message) diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index 458f4ff392be067c4c98ec1ac91f98204406030f..6fbcabb1675a8a2d88e1d6dde7d08e8ce56585eb 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + #Linear Module to use with ZeRO Stage 3 to allow for parameter memory release #after the module execution during forward #Instead of saving variables using save_for_backward, we save variable ids @@ -18,19 +20,26 @@ from torch.nn.parameter import Parameter from torch.nn import init from torch.nn.modules.module import Module from deepspeed.runtime.utils import noop_decorator +from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator tensor_map = {} def print_rank_0(message, debug=False, force=False): - if torch.distributed.get_rank() == 0 and (debug or force): + if dist.get_rank() == 0 and (debug or force): print(message) -try: - autocast_custom_fwd = torch.cuda.amp.custom_fwd - autocast_custom_bwd = torch.cuda.amp.custom_bwd -except (ImportError, AttributeError) as 
exp: +device = get_accelerator().device_name() +if device == 'cuda': + try: + autocast_custom_fwd = torch.cuda.amp.custom_fwd + autocast_custom_bwd = torch.cuda.amp.custom_bwd + except (ImportError, AttributeError) as exp: + autocast_custom_fwd = noop_decorator + autocast_custom_bwd = noop_decorator +else: autocast_custom_fwd = noop_decorator autocast_custom_bwd = noop_decorator @@ -110,6 +119,13 @@ class LinearFunctionForZeroStage3(torch.autograd.Function): return grad_input, grad_weight, grad_bias +def zero3_linear_wrap(input, weight, bias=None): + if bias is None: + return LinearFunctionForZeroStage3.apply(input, weight) + else: + return LinearFunctionForZeroStage3.apply(input, weight, bias) + + class LinearModuleForZeroStage3(Module): r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`. The weights are pre-transposed and stored as A^T instead of transposing during each diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index c438a7638a763c497be322509382cdccb91c3386..7ea76c68178d06cd0e906f284aae3893d9208559 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -1,75 +1,94 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team. +'''Copyright The Microsoft DeepSpeed Team''' +""" +Copyright (c) Microsoft Corporation Licensed under the MIT license. 
-''' - -from deepspeed.runtime.config_utils import get_scalar_param -from .offload_constants import * -from .utils import logger - -OFFLOAD_PARAM_KEY_DEFAULT_DICT = { - OFFLOAD_PARAM_DEVICE: OFFLOAD_PARAM_DEVICE_DEFAULT, - OFFLOAD_PARAM_NVME_PATH: OFFLOAD_PARAM_NVME_PATH_DEFAULT, - OFFLOAD_PARAM_BUFFER_COUNT: OFFLOAD_PARAM_BUFFER_COUNT_DEFAULT, - OFFLOAD_PARAM_BUFFER_SIZE: OFFLOAD_PARAM_BUFFER_SIZE_DEFAULT, - OFFLOAD_PARAM_MAX_IN_CPU: OFFLOAD_PARAM_MAX_IN_CPU_DEFAULT, - OFFLOAD_PARAM_PIN_MEMORY: OFFLOAD_PARAM_PIN_MEMORY_DEFAULT -} - -OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT = { - OFFLOAD_OPTIMIZER_DEVICE: OFFLOAD_OPTIMIZER_DEVICE_DEFAULT, - OFFLOAD_OPTIMIZER_NVME_PATH: OFFLOAD_OPTIMIZER_NVME_PATH_DEFAULT, - OFFLOAD_OPTIMIZER_BUFFER_COUNT: OFFLOAD_OPTIMIZER_BUFFER_COUNT_DEFAULT, - OFFLOAD_OPTIMIZER_PIN_MEMORY: OFFLOAD_OPTIMIZER_PIN_MEMORY_DEFAULT, - OFFLOAD_OPTIMIZER_PIPELINE_READ: OFFLOAD_OPTIMIZER_PIPELINE_READ_DEFAULT, - OFFLOAD_OPTIMIZER_PIPELINE_WRITE: OFFLOAD_OPTIMIZER_PIPELINE_WRITE_DEFAULT, - OFFLOAD_OPTIMIZER_FAST_INIT: OFFLOAD_OPTIMIZER_FAST_INIT_DEFAULT -} - - -def _get_offload_config(param_dict, key_default_dict): - offload_config = {} - for key, default_value in key_default_dict.items(): - offload_config[key] = get_scalar_param(param_dict, key, default_value) - - return offload_config - - -def get_offload_param_config(param_dict): - if OFFLOAD_PARAM in param_dict and param_dict[OFFLOAD_PARAM] is not None: - offload_config = _get_offload_config( - param_dict=param_dict[OFFLOAD_PARAM], - key_default_dict=OFFLOAD_PARAM_KEY_DEFAULT_DICT) - device = offload_config.get("device", OFFLOAD_PARAM_DEVICE_DEFAULT) - assert device in VALID_OFFLOAD_DEVICES, f'Invalid parameter offloading device specified: {device}.' 
- if device == OFFLOAD_NONE_DEVICE: - return None - return offload_config - return None - - -def get_default_offload_param_config(): - return OFFLOAD_PARAM_KEY_DEFAULT_DICT - - -def get_offload_optimizer_config(param_dict): - if OFFLOAD_OPTIMIZER in param_dict and param_dict[OFFLOAD_OPTIMIZER] is not None: - offload_config = _get_offload_config( - param_dict=param_dict[OFFLOAD_OPTIMIZER], - key_default_dict=OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT) - - device = offload_config.get("device", OFFLOAD_OPTIMIZER_DEVICE_DEFAULT) - assert device in VALID_OFFLOAD_DEVICES, f'Invalid optimizer offloading device specified: {device}.' - if device == OFFLOAD_NONE_DEVICE: - return None - - offload_config[OFFLOAD_OPTIMIZER_PIPELINE] = offload_config[ - OFFLOAD_OPTIMIZER_PIPELINE_READ] or offload_config[ - OFFLOAD_OPTIMIZER_PIPELINE_WRITE] - return offload_config - - return None - - -def get_default_offload_optimizer_config(): - return OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT +""" + +from pydantic import Field, validator +from enum import Enum +from pathlib import Path +from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int + + +class OffloadDeviceEnum(str, Enum): + """ Enum for valid offload devices """ + none = "none" + cpu = "cpu" + nvme = "nvme" + + +class DeepSpeedZeroOffloadParamConfig(DeepSpeedConfigModel): + """ Set options for parameter offload. Valid only with stage 3. """ + + device: OffloadDeviceEnum = "none" + """ + Device memory to offload model parameters. Supported options are `cpu` and + `nvme`. + """ + + nvme_path: Path = None + """ Filesystem path for NVMe device for parameter offloading. """ + + buffer_count: int = Field(5, ge=0) + """ Number of buffers in buffer pool for parameter offloading to NVMe. """ + + buffer_size: int = Field(pp_int(1e8), ge=0) + """ Size of buffers in buffer pool for parameter offloading to NVMe. 
""" + + max_in_cpu: int = Field(pp_int(1e9), ge=0) + """ + Number of parameter elements to maintain in CPU memory when offloading to + NVMe is enabled. + """ + + pin_memory: bool = False + """ + Offload to page-locked CPU memory. This could boost throughput at the cost + of extra memory overhead. + """ + + +class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel): + """ Set options for optimizer offload. Valid with stage 1, 2, and 3. """ + + device: OffloadDeviceEnum = "none" + """ + Device memory to offload optimizer state. Supported options are `cpu` and + `nvme`. Optimizer computation is offload to CPU regardless of device option. + """ + + nvme_path: Path = None + """ Filesystem path for NVMe device for optimizer state offloading. """ + + buffer_count: int = Field(4, ge=0) + """ + Number of buffers in buffer pool for optimizer state offloading to NVMe. + This should be at least the number of states maintained per parameter by + the optimizer. For example, Adam optimizer has 4 states (parameter, + gradient, momentum, and variance). + """ + + pin_memory: bool = False + """ + Offload to page-locked CPU memory. This could boost throughput at the cost + of extra memory overhead. + """ + + pipeline_read: bool = False + """ + For tile-based optimizer step processing, overlap read of next tile with + computation of current tile. Used in ZeRO-Infinity. + """ + + pipeline_write: bool = False + """ + For tile-based optimizer step processing, overlap write of previous tile + with computation of current tile. + """ + + fast_init: bool = False + """ Enable fast optimizer initialization when offloading to NVMe. 
""" + @validator("pipeline_read", "pipeline_write", always=True) + def set_pipeline(cls, field_value, values): + values["pipeline"] = field_value or values.get("pipeline", False) + return field_value diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py new file mode 100644 index 0000000000000000000000000000000000000000..336c63e987219860d15a76c3dcf68319324fc883 --- /dev/null +++ b/deepspeed/runtime/zero/parameter_offload.py @@ -0,0 +1,516 @@ +""" +"Copyright 2022 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +""" + +import sys +import torch +from collections import OrderedDict +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.runtime.zero.partition_parameters import _init_external_params +from deepspeed.runtime.zero.partition_parameters import * +from deepspeed.runtime.zero.partitioned_param_coordinator import PartitionedParameterCoordinator, iter_params +from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator + +FWD_MODULE_STACK = list() + + +def is_builtin_type(obj): + # https://stackoverflow.com/a/17795199 + return obj.__class__.__module__ == '__builtin__' or obj.__class__.__module__ == "builtins" + + +# ensure we only warn once, otherwise every iteration will trigger a warning +warned = False + + +#apply torch.autograd.Function that calls a backward_function to tensors in output +def _apply_to_tensors_only(module, functional, backward_function, outputs): + if isinstance(outputs, (tuple, list)): + touched_outputs = [] + for output in outputs: + touched_output = _apply_to_tensors_only(module, + functional, + backward_function, + output) + touched_outputs.append(touched_output) + return outputs.__class__(touched_outputs) + elif isinstance(outputs, dict): + # apply inplace to avoid recreating dict inherited objects + for key in outputs.keys(): + outputs[key] = 
_apply_to_tensors_only(module, + functional, + backward_function, + outputs[key]) + return outputs + + elif isinstance(outputs, torch.Tensor): + # this also applies to torch.Tensor's subclasses like torch.nn.parameter.Parameter + touched_outputs = functional.apply(module, backward_function, outputs) + + # restore zero param attributes if those get stripped by `backward_function` + if not is_zero_param(touched_outputs) and is_zero_param(outputs): + touched_outputs.ds_param_alias = outputs + return touched_outputs + else: + if not is_builtin_type(outputs): + global warned + if not warned and dist.get_rank() == 0: + logger.warning( + f"A module has unknown inputs or outputs type ({type(outputs)}) and the tensors embedded in it cannot be detected. " + "The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and " + "output tensors and therefore may not get triggered properly.") + warned = True + return outputs + + +#for each tensor in outputs run the forward_function and register backward_function as hook +def _apply_forward_and_backward_to_tensors_only(module, + forward_function, + backward_function, + outputs): + if type(outputs) is tuple: + touched_outputs = [] + for output in outputs: + touched_output = _apply_forward_and_backward_to_tensors_only( + module, + forward_function, + backward_function, + output) + touched_outputs.append(touched_output) + return tuple(touched_outputs) + elif type(outputs) is torch.Tensor: + forward_function(outputs) + if outputs.requires_grad: + outputs.register_hook(backward_function) + return outputs + else: + return outputs + + +class ZeROOrderedDict(OrderedDict): + def __init__(self, parent_module, *args, **kwargs): + """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. 
+ + Args: + parent_module (``collections.OrderedDict``): the collection to replace + """ + + super().__init__(*args, **kwargs) + self._parent_module = parent_module + self._in_forward = False + + def __getitem__(self, key): + param = super().__getitem__(key) + + # Params can be registered as None (e.g., bias) + if param is None: + return param + + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if self._parent_module._parameters._in_forward: + register_external_parameter(FWD_MODULE_STACK[-1], param) + param.all_gather() + print_rank_0( + f'Registering external parameter from getter {key} ds_id = {param.ds_id}', + force=False) + + return param + + +def _inject_parameters(module, cls): + for module in module.modules(): + if cls == ZeROOrderedDict: + new_param = cls(parent_module=module) + else: + new_param = cls() + + for key, param in module._parameters.items(): + new_param[key] = param + module._parameters = new_param + + +class PreBackwardFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, module, pre_backward_function, outputs): + ctx.module = module + ctx.pre_backward_function = pre_backward_function + if not hasattr(module, "applied_pre_backward_ref_cnt"): + module.applied_pre_backward_ref_cnt = 0 + module.applied_pre_backward_ref_cnt += 1 + #print(f"After Forward: {ctx.module.__class__.__name__}") + outputs = outputs.detach() + return outputs + + @staticmethod + def backward(ctx, *args): + #print(f"Before Backward: {ctx.module.__class__.__name__}") + ctx.pre_backward_function(ctx.module) + return (None, None) + args + + +class PostBackwardFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, module, pre_backward_function, output): + ctx.module = module + if output.requires_grad: + #TODO SOME TIMES post backward does not seem to be triggered debug in detail + #Should only cause increase in memory not correctness issue + #if output.grad_fn.__class__.__name__ == 'ViewBackward': + # ctx.view=True + # print(f"Warning view 
tensor for input to module : {module.__class__.__name__}. Backward hooks may not trigger properly") + #assert len(module.parameters(recurse=False)), "The input tensor to the module is a view, and autograd Function or register_hook is not triggered with view tensors." + #if module.ds_grads_remaining == 0: + # print(f"Before Forward: {ctx.module.__class__.__name__}") + module.ds_grads_remaining += 1 + ctx.pre_backward_function = pre_backward_function + output = output.detach() + return output + + @staticmethod + def backward(ctx, *args): + ctx.module.ds_grads_remaining = ctx.module.ds_grads_remaining - 1 + if ctx.module.ds_grads_remaining == 0: + ctx.pre_backward_function(ctx.module) + #print(f"After Backward: {ctx.module.__class__.__name__}") + return (None, None) + args + + +class DeepSpeedZeRoOffload(object): + def __init__(self, + module, + timers, + ds_config, + overlap_comm=True, + prefetch_bucket_size=50000000, + max_reuse_distance=1000000000, + max_live_parameters=1000000000, + param_persistence_threshold=100000, + model_persistence_threshold=sys.maxsize, + offload_param_config=None, + mpu=None): + + see_memory_usage("DeepSpeedZeRoOffload initialize [begin]", force=True) + + print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", + force=False) + + self.module = module + self.dtype = list(module.parameters())[0].dtype + self.offload_device = None + self.offload_param_pin_memory = False + + if offload_param_config is not None and offload_param_config.device != OffloadDeviceEnum.none: + self.offload_device = offload_param_config.device + self.offload_param_pin_memory = offload_param_config.pin_memory + + self._convert_to_zero_parameters(ds_config, module, mpu) + + for m in module.modules(): + _init_external_params(m) + + _inject_parameters(module, ZeROOrderedDict) + + self.param_numel_persistence_threshold = int(param_persistence_threshold) + self.model_persistence_threshold = int(model_persistence_threshold) + self.persistent_parameters = 
self.mark_persistent_parameters( + self.param_numel_persistence_threshold, + self.model_persistence_threshold) + + self.param_coordinators = {} + self._prefetch_bucket_sz = int(prefetch_bucket_size) + self._max_reuse_distance_in_numel = int(max_reuse_distance) + self._max_available_parameters_in_numel = int(max_live_parameters) + self.__allgather_stream = get_accelerator().Stream( + ) if overlap_comm else get_accelerator().default_stream() + + self.forward_hooks = [] + self.backward_hooks = [] + self.setup_zero_stage3_hooks() + print_rank_0( + f'Created module hooks: forward = {len(self.forward_hooks)}, backward = {len(self.backward_hooks)}', + force=False) + + see_memory_usage("DeepSpeedZeRoOffload initialize [end]", force=True) + + @instrument_w_nvtx + def partition_all_parameters(self): + """Partitioning Parameters that were not partitioned usually if parameters + of modules whose input parameters do not require grad computation do not + trigger post call and will therefore will remain unpartitioned""" + self.get_param_coordinator(training=self.module.training).release_and_reset_all( + self.module) + for param in iter_params(self.module, recurse=True): + if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: + raise RuntimeError(f"{param.ds_summary()} expected to be released") + + def get_param_coordinator(self, training): + if not training in self.param_coordinators: + self.param_coordinators[training] = PartitionedParameterCoordinator( + prefetch_bucket_sz=self._prefetch_bucket_sz, + max_reuse_distance_in_numel=self._max_reuse_distance_in_numel, + max_available_parameters_in_numel=self. 
+ _max_available_parameters_in_numel, + allgather_stream=self.__allgather_stream, + prefetch_nvme=self.offload_device == OffloadDeviceEnum.nvme, + ) + + return self.param_coordinators[training] + + def _convert_to_zero_parameters(self, ds_config, module, mpu): + non_zero_params = [p for p in module.parameters() if not is_zero_param(p)] + if non_zero_params: + zero_params = [p for p in module.parameters() if is_zero_param(p)] + if zero_params: + zero_params[0].convert_to_zero_parameters(param_list=non_zero_params) + else: + group = None + if mpu: + group = mpu.get_data_parallel_group() + + Init(module=module, + data_parallel_group=group, + dtype=self.dtype, + config_dict_or_path=ds_config, + remote_device=self.offload_device, + pin_memory=self.offload_param_pin_memory, + mpu=mpu) + + def destroy(self): + self._remove_module_hooks() + + def _remove_module_hooks(self): + num_forward_hooks = len(self.forward_hooks) + num_backward_hooks = len(self.backward_hooks) + + for hook in self.forward_hooks: + hook.remove() + + for hook in self.backward_hooks: + hook.remove() + + print_rank_0( + f'Deleted module hooks: forward = {num_forward_hooks}, backward = {num_backward_hooks}', + force=False) + + def setup_zero_stage3_hooks(self): + self.hierarchy = 0 + + #reset step if in inference mode + @instrument_w_nvtx + def _end_of_forward_hook(module, *args): + + if not torch._C.is_grad_enabled(): + self.get_param_coordinator(training=False).reset_step() + + #likely one of them should be enough but just to be safe + self._register_hooks_recursively(self.module) + self.module.register_forward_hook(_end_of_forward_hook) + + # Add top module to stack trace + global FWD_MODULE_STACK + FWD_MODULE_STACK.append(self.module) + + def mark_persistent_parameters(self, param_threshold, model_threshold): + persistent_params = [] + total_persistent_parameters = 0 + params_count = 0 + for _, param in self.module.named_parameters(recurse=True): + if param.ds_numel + total_persistent_parameters > 
model_threshold: + continue + + if param.ds_numel < param_threshold: + params_count += 1 + param.ds_persist = True + persistent_params.append(param) + total_persistent_parameters += param.ds_numel + + print_rank_0( + f"Parameter Offload: Total persistent parameters: {total_persistent_parameters} in {params_count} params", + force=True) + + return persistent_params + + def _register_hooks_recursively(self, module, count=[0]): + my_count = count[0] + module.id = my_count + + #print(f"{module.__class__} : {module.id}") + + for child in module.children(): + count[0] = count[0] + 1 + self._register_hooks_recursively(child, count=count) + + @instrument_w_nvtx + def _pre_forward_module_hook(module, *args): + self.pre_sub_module_forward_function(module) + + @instrument_w_nvtx + def _post_forward_module_hook(module, input, output): + global FWD_MODULE_STACK + FWD_MODULE_STACK.pop() + if output is None: + output = [] + elif not isinstance(output, (list, tuple)): + if torch.is_tensor(output): + output = [output] + else: + #print(f'got UNKNOWN type {type(output)}') + outputs = [] + output = output if isinstance(output, dict) else vars(output) + for name, val in output.items(): + if not name.startswith('__') and torch.is_tensor(val): + outputs.append(val) + output = outputs + + for item in filter( + lambda item: is_zero_param(item) or hasattr(item, + 'ds_param_alias'), + output): + key = id(item) if hasattr(item, 'ds_id') else id(item.ds_param_alias) + actual_external_param = item if hasattr(item, + 'ds_id') else item.ds_param_alias + + if not any(key in m._external_params for m in FWD_MODULE_STACK): + actual_external_param.is_external_param = True + module_to_register = FWD_MODULE_STACK[-1] + register_external_parameter(module_to_register, + actual_external_param) + print_rank_0( + f'Registering dangling parameter for module {module_to_register.__class__.__name__}, ds_id = {actual_external_param.ds_id}.', + force=False) + + # It's possible that the parameter was already 
external to the completed module. If so, remove it the + # registration as it will be covered by the outer module instead. + if key in module._external_params: + print_rank_0( + f' Unregistering nested dangling parameter from module {module.__class__.__name__}, ds_id = {actual_external_param.ds_id}', + force=False) + unregister_external_parameter(module, actual_external_param) + + actual_external_param.all_gather() + + self.post_sub_module_forward_function(module) + + def _pre_backward_module_hook(module, inputs, output): + @instrument_w_nvtx + def _run_before_backward_function(sub_module): + # some models (e.g. Albert) may run multiple forwards on the same layer in a loop + # before doing backwards, so each backward will need a pre-fetch - using reference + # counting to support this scenario + #print(f"COUNTER before: {sub_module.applied_pre_backward_ref_cnt}") + if sub_module.applied_pre_backward_ref_cnt > 0: + self.pre_sub_module_backward_function(sub_module) + sub_module.applied_pre_backward_ref_cnt -= 1 + #print(f"COUNTER after: {sub_module.applied_pre_backward_ref_cnt}") + + return _apply_to_tensors_only(module, + PreBackwardFunction, + _run_before_backward_function, + output) + + #This is an alternate to doing _post_backward_module_hook + #it uses tensor.register_hook instead of using torch.autograd.Function + def _alternate_post_backward_module_hook(module, inputs): + module.ds_grads_remaining = 0 + + #print(f"Before Forward {module.__class__.__name__}") + + def _run_after_backward_hook(*unused): + module.ds_grads_remaining = module.ds_grads_remaining - 1 + if module.ds_grads_remaining == 0: + #print(f"After backward {module.__class__.__name__}") + self.post_sub_module_backward_function(module) + + def _run_before_forward_function(input): + if input.requires_grad: + module.ds_grads_remaining += 1 + + return _apply_forward_and_backward_to_tensors_only( + module, + _run_before_forward_function, + _run_after_backward_hook, + inputs) + + def 
_post_backward_module_hook(module, inputs): + module.ds_grads_remaining = 0 + + @instrument_w_nvtx + def _run_after_backward_function(sub_module): + if sub_module.ds_grads_remaining == 0: + self.post_sub_module_backward_function(sub_module) + + return _apply_to_tensors_only(module, + PostBackwardFunction, + _run_after_backward_function, + inputs) + + # Pre forward hook + self.forward_hooks.append( + module.register_forward_pre_hook(_pre_forward_module_hook)) + + # Post forward hook + self.forward_hooks.append( + module.register_forward_hook(_post_forward_module_hook)) + + # Pre backward hook + self.backward_hooks.append( + module.register_forward_hook(_pre_backward_module_hook)) + + # post backward hook + self.backward_hooks.append( + module.register_forward_pre_hook(_post_backward_module_hook)) + + @torch.no_grad() + def pre_sub_module_forward_function(self, sub_module): + see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", + force=False) + + global FWD_MODULE_STACK + FWD_MODULE_STACK.append(sub_module) + + param_coordinator = self.get_param_coordinator(training=sub_module.training) + param_coordinator.trace_prologue(sub_module) + if param_coordinator.is_record_trace(): + param_coordinator.record_module(sub_module) + param_coordinator.fetch_sub_module(sub_module) + + see_memory_usage( + f"Before sub module function {sub_module.__class__.__name__} after fetch", + force=False) + + @torch.no_grad() + def post_sub_module_forward_function(self, sub_module): + see_memory_usage( + f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", + force=False) + + param_coordinator = self.get_param_coordinator(training=sub_module.training) + param_coordinator.release_sub_module(sub_module) + + see_memory_usage( + f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", + force=False) + + @torch.no_grad() + def pre_sub_module_backward_function(self, sub_module): + param_coordinator = 
self.get_param_coordinator(training=sub_module.training) + param_coordinator.trace_prologue(sub_module) + if param_coordinator.is_record_trace(): + param_coordinator.record_module(sub_module) + param_coordinator.fetch_sub_module(sub_module) + + @torch.no_grad() + def post_sub_module_backward_function(self, sub_module): + see_memory_usage( + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release", + force=False) + + self.get_param_coordinator( + training=sub_module.training).release_sub_module(sub_module) + + see_memory_usage( + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release", + force=False) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py old mode 100644 new mode 100755 index 80b1ee34bcec3d16ddb02cd88c4035b94671cd5a..39cfb7f90202cc760fef64aefe3e0c054c6456e3 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -5,7 +5,6 @@ Licensed under the MIT license. 
import math import os -import time import types from typing import Callable, Iterable from enum import Enum @@ -15,60 +14,40 @@ from typing import List import torch from torch import Tensor -import torch.distributed as dist -from torch.distributed.distributed_c10d import _get_global_rank, group +from deepspeed import comm as dist from torch.nn import Module from torch.nn import Parameter -from .linear import LinearModuleForZeroStage3, LinearFunctionForZeroStage3 -from .offload_constants import * +from .linear import zero3_linear_wrap import deepspeed from ..utils import get_only_unique_item, see_memory_usage from deepspeed.runtime.zero.utils import assert_ints_same_as_other_ranks -from deepspeed.utils import init_distributed, instrument_w_nvtx, logger -from deepspeed.utils.debug import debug_param2name_id_shape, debug_param2name_id_shape_device, debug_module2name, debug_param2name, debug_param2name_id_shape_status, printflock, log_rank_file -from deepspeed.utils.logging import logger - +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.utils import instrument_w_nvtx, logger +from deepspeed.comm.comm import init_distributed +from deepspeed.utils.debug import (debug_param2name_id_shape, + debug_param2name_id_shape_device, + debug_module2name, + debug_param2name_id, + debug_param2name_id_shape_status) +from deepspeed.accelerator import get_accelerator from ..swap_tensor.partitioned_param_swapper import AsyncPartitionedParameterSwapper, PartitionedParamStatus param_count = 0 partitioned_param_data_shape = [0] +zero_init_enabled = False -if hasattr(torch.distributed, "_all_gather_base"): - def torch_allgather_fn(input_tensor: Tensor, output_tensor: Tensor, group): - try: - return instrument_w_nvtx(torch.distributed._all_gather_base)( - output_tensor, - input_tensor, - group=group, - async_op=True, - ) - except RuntimeError as e: - raise RuntimeError( - f"output_tensor: {output_tensor.device}, input_tensor: {input_tensor.device}" - ) 
from e -else: - logger.warning( - "unable to find torch.distributed._all_gather_base. will fall back to " - "torch.distributed.all_gather which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") - - def torch_allgather_fn(input_tensor: Tensor, output_tensor: Tensor, group): - output_tensors = list( - torch.chunk(output_tensor, - torch.distributed.get_world_size(group))) - return instrument_w_nvtx(torch.distributed.all_gather)( - output_tensors, - input_tensor, - group=group, - async_op=True, - ) +def _dist_allgather_fn(input_tensor: Tensor, output_tensor: Tensor, group=None): + return instrument_w_nvtx(dist.allgather_fn)(output_tensor, + input_tensor, + group=group, + async_op=True) def print_rank_0(message, debug=False, force=False): - rank = torch.distributed.get_rank() + rank = dist.get_rank() if rank == 0 and (debug or force): print(message) # other variations @@ -79,7 +58,7 @@ def print_rank_0(message, debug=False, force=False): def debug_rank0(msg: str) -> None: - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: logger.debug(msg) @@ -213,7 +192,8 @@ def zero_wrapper_for_fp_tensor_constructor(fn: Callable, target_fp_dtype: torch.dtype) -> Callable: def wrapped_fn(*args, **kwargs) -> Tensor: if kwargs.get("device", None) is None: - kwargs['device'] = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) + kwargs['device'] = torch.device(get_accelerator().device_name( + os.environ["LOCAL_RANK"])) tensor: Tensor = fn(*args, **kwargs) if tensor.is_floating_point(): tensor = tensor.to(target_fp_dtype) @@ -225,7 +205,7 @@ def zero_wrapper_for_fp_tensor_constructor(fn: Callable, def get_new_tensor_fn_for_dtype(dtype: torch.dtype) -> Callable: def new_tensor(cls, *args) -> Tensor: - device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) + device = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) tensor = _orig_torch_empty(0, device=device).new_empty(*args) if 
tensor.is_floating_point(): tensor = tensor.to(dtype) @@ -253,10 +233,10 @@ def get_all_subclasses(cls): def free_param(param: Parameter) -> None: """Free underlying storage of a parameter.""" assert not param.ds_active_sub_modules, param.ds_summary() - if param.data.is_cuda: + if get_accelerator().on_accelerator(param.data): # need to make sure that we don't free the parameter while it is still # being used for computation - param.data.record_stream(torch.cuda.current_stream()) + param.data.record_stream(get_accelerator().current_stream()) # param.data doesn't store anything meaningful in partitioned state param.data = torch.empty(0, dtype=param.dtype, device=param.device) param.ds_status = ZeroParamStatus.NOT_AVAILABLE @@ -281,8 +261,10 @@ class InsertPostInitMethodToModuleSubClasses(object): assert self.dtype in [torch.half, torch.bfloat16, torch.float], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.bfloat16, torch.float]" def __enter__(self): + global zero_init_enabled if not self.enabled: return + zero_init_enabled = True def apply_with_gather(orig_module_apply_fn: Callable) -> Callable: """many models make use of child modules like Linear or Embedding which @@ -334,9 +316,7 @@ class InsertPostInitMethodToModuleSubClasses(object): fn_to_apply(module_to_apply_fn_to) for param in params_to_apply_fn_to: - torch.distributed.broadcast(param.data, - 0, - group=param.ds_process_group) + dist.broadcast(param.data, 0, group=param.ds_process_group) for param in params_to_apply_fn_to: param.partition(has_been_updated=True) @@ -423,36 +403,15 @@ class InsertPostInitMethodToModuleSubClasses(object): "nn.functional.linear has been overridden with a more memory efficient version. 
This will persist unless manually reset.", force=False) self.linear_bk = torch.nn.functional.linear - torch.nn.functional.linear = LinearFunctionForZeroStage3.apply + torch.nn.functional.linear = zero3_linear_wrap def __exit__(self, exc_type, exc_value, traceback): if not self.enabled: return - def _disable_class(cls): - cls.__init__ = cls._old_init - - # Replace .__init__() for all existing subclasses of torch.nn.Module - for subclass in get_all_subclasses(torch.nn.modules.module.Module): - _disable_class(subclass) - - # putting methods back the way we found them - torch.nn.modules.module.Module.__init_subclass__ = torch.nn.modules.module.Module._old_init_subclass - torch.nn.modules.module.Module.apply = torch.nn.modules.module.Module._old_apply - - torch.Tensor.__new__ = torch.Tensor.__old_new__ - torch.empty = _orig_torch_empty - torch.zeros = _orig_torch_zeros - torch.ones = _orig_torch_ones - torch.full = _orig_torch_full + shutdown_init_context() - # un doing it here will undo it during training - # if self.mem_efficient_linear: - # torch.nn.functional.linear = self.linear_bk - # if self.mem_efficient_linear: - # torch.nn.functional.linear = self.linear_bk - - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: logger.info("finished initializing model with %.2fB parameters", param_count / 1e9) @@ -479,6 +438,38 @@ class InsertPostInitMethodToModuleSubClasses(object): self.dtype = dtype or torch.half +def shutdown_init_context(): + global zero_init_enabled + + if not zero_init_enabled: + return + + def _disable_class(cls): + cls.__init__ = cls._old_init + + # Replace .__init__() for all existing subclasses of torch.nn.Module + for subclass in get_all_subclasses(torch.nn.modules.module.Module): + _disable_class(subclass) + + # putting methods back the way we found them + torch.nn.modules.module.Module.__init_subclass__ = torch.nn.modules.module.Module._old_init_subclass + torch.nn.modules.module.Module.apply = 
torch.nn.modules.module.Module._old_apply + + torch.Tensor.__new__ = torch.Tensor.__old_new__ + torch.empty = _orig_torch_empty + torch.zeros = _orig_torch_zeros + torch.ones = _orig_torch_ones + torch.full = _orig_torch_full + + # un doing it here will undo it during training + # if self.mem_efficient_linear: + # torch.nn.functional.linear = self.linear_bk + # if self.mem_efficient_linear: + # torch.nn.functional.linear = self.linear_bk + + zero_init_enabled = False + + class AllGatherHandle: def __init__(self, handle, param: Parameter) -> None: if param.ds_status != ZeroParamStatus.INFLIGHT: @@ -537,7 +528,7 @@ class AllGatherCoalescedHandle: param.ds_status = ZeroParamStatus.AVAILABLE for part_to_copy in partitions: - part_to_copy.record_stream(torch.cuda.current_stream()) + part_to_copy.record_stream(get_accelerator().current_stream()) param_offset += param.ds_tensor.ds_numel @@ -566,7 +557,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): Args: module (``torch.nn.Module``, optional): If provided, partition the model as if it was constructed in the context. - data_parallel_group (``torch.distributed`` process group, optional): + data_parallel_group (``deepspeed.comm`` process group, optional): The group of processes to partition among. Defaults to all processes. mem_efficient_linear (bool, optional): Replace torch.nn.functional.linear with an implementation that allows @@ -615,8 +606,8 @@ class Init(InsertPostInitMethodToModuleSubClasses): this feature must be used. .. note:: - Initializes ``torch.distributed`` if it has not already been done so. - See :meth:`deepseed.init_distributed` for more information. + Initializes ``deepspeed.comm`` if it has not already been done so. + See :meth:`deepspeed.init_distributed` for more information. .. 
note:: Can also be used as a decorator: @@ -670,36 +661,41 @@ class Init(InsertPostInitMethodToModuleSubClasses): mem_efficient_linear=mem_efficient_linear, ds_config=_ds_config, dtype=dtype) - if not torch.distributed.is_initialized(): + if not dist.is_initialized(): init_distributed() - assert torch.distributed.is_initialized(), "Parameters cannot be scattered without initializing torch.distributed" + assert dist.is_initialized(), "Parameters cannot be scattered without initializing deepspeed.comm" if data_parallel_group is None: - self.ds_process_group = torch.distributed.group.WORLD + self.ds_process_group = dist.get_world_group() else: self.ds_process_group = data_parallel_group - self.rank = torch.distributed.get_rank(group=self.ds_process_group) - self.world_size = torch.distributed.get_world_size(group=self.ds_process_group) + self.rank = dist.get_rank(group=self.ds_process_group) + self.world_size = dist.get_world_size(group=self.ds_process_group) # Local device is the device where the parameters are consumed, must be default device. # It is the device where parameters are fully instantiated using allgather - self.local_device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) - torch.cuda.set_device(self.local_device) + self.local_device = torch.device(get_accelerator().device_name( + os.environ["LOCAL_RANK"])) + get_accelerator().set_device(self.local_device) if _ds_config is not None and _ds_config.zero_config.offload_param is not None: - remote_device = _ds_config.zero_config.offload_param[OFFLOAD_PARAM_DEVICE] - pin_memory = _ds_config.zero_config.offload_param[OFFLOAD_PARAM_PIN_MEMORY] + remote_device = _ds_config.zero_config.offload_param.device + pin_memory = _ds_config.zero_config.offload_param.pin_memory self._validate_remote_device(remote_device, _ds_config) # Remote device is the device where parameter partitions are stored # It can be same as local_device or it could be CPU or NVMe. 
- self.remote_device = self.local_device if remote_device is None else remote_device - self.pin_memory = pin_memory if (self.remote_device - == OFFLOAD_CPU_DEVICE) else False + self.remote_device = self.local_device if remote_device in [ + None, + OffloadDeviceEnum.none + ] else remote_device + self.pin_memory = pin_memory if ( + self.remote_device in [OffloadDeviceEnum.cpu, + OffloadDeviceEnum.nvme]) else False # Enable fp16 param swapping to NVMe - if self.remote_device == OFFLOAD_NVME_DEVICE: + if self.remote_device == OffloadDeviceEnum.nvme: self.param_swapper = AsyncPartitionedParameterSwapper(_ds_config, self.dtype) else: self.param_swapper = None @@ -710,10 +706,9 @@ class Init(InsertPostInitMethodToModuleSubClasses): self._convert_to_zero_parameters(module.parameters(recurse=True)) self.use_all_gather_base = False - try: - from torch.distributed.distributed_c10d import _all_gather_base as all_gather + if dist.has_allgather_base(): self.use_all_gather_base = True - except: + else: logger.info( f"_all_gather_base API is not available in torch {torch.__version__}") @@ -726,19 +721,18 @@ class Init(InsertPostInitMethodToModuleSubClasses): def _validate_remote_device(self, remote_device, ds_config): if ds_config is not None: - if remote_device in [None, OFFLOAD_CPU_DEVICE]: + if remote_device in [None, OffloadDeviceEnum.cpu]: if ds_config.zero_config.offload_param is not None: - offload_param_device = ds_config.zero_config.offload_param[ - OFFLOAD_PARAM_DEVICE] - assert offload_param_device != OFFLOAD_NVME_DEVICE, \ - f"{OFFLOAD_PARAM_DEVICE} in DeepSpeed Config cannot be {offload_param_device} if remote device is {remote_device}." + offload_param_device = ds_config.zero_config.offload_param.device + assert offload_param_device != OffloadDeviceEnum.nvme, \ + f"'device' in DeepSpeed Config cannot be {offload_param_device} if remote device is {remote_device}." 
- if remote_device == OFFLOAD_NVME_DEVICE: + if remote_device == OffloadDeviceEnum.nvme: assert ds_config.zero_config.offload_param is not None, \ - f'{OFFLOAD_PARAM} must be defined in DeepSpeed Config if remote device is {OFFLOAD_NVME_DEVICE}.' + f'"offload_param" must be defined in DeepSpeed Config if remote device is {OffloadDeviceEnum.nvme}.' - assert ds_config.zero_config.offload_param[OFFLOAD_PARAM_NVME_PATH] is not None, \ - f'{OFFLOAD_PARAM_NVME_PATH} in DeepSpeed Config cannot be None if remote device is {OFFLOAD_NVME_DEVICE}' + assert ds_config.zero_config.offload_param.nvme_path is not None, \ + f'"nvme_path" in DeepSpeed Config cannot be None if remote device is {OffloadDeviceEnum.nvme}' def _post_init_method(self, module): #see_memory_usage(f"Before converting parmas in {module.__class__.__name__}", force=False) @@ -756,10 +750,10 @@ class Init(InsertPostInitMethodToModuleSubClasses): f"Partitioning param {debug_param2name_id_shape(param)} module={debug_module2name(module)}" ) - if param.is_cuda: - torch.distributed.broadcast(param, 0, self.ds_process_group) + if get_accelerator().on_accelerator(param): + dist.broadcast(param, 0, self.ds_process_group) else: - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: logger.warn(f"param `{name}` in {module.__class__.__name__} " f"not on GPU so was not broadcasted from rank 0") @@ -801,7 +795,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): # else this is set to None param.nvme_swapper = self.param_swapper - # DeepSped Param ID + # DeepSpeed Param ID param.ds_id = Init.param_id Init.param_id += 1 @@ -848,14 +842,13 @@ class Init(InsertPostInitMethodToModuleSubClasses): param_buffer = torch.empty( math.ceil(param.ds_numel / self.world_size) * self.world_size, dtype=param.dtype, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), requires_grad=False, ) - handle = torch_allgather_fn( - param.ds_tensor.to(torch.cuda.current_device()), + handle = 
_dist_allgather_fn( + param.ds_tensor.to(get_accelerator().current_device_name()), param_buffer, - self.ds_process_group, - ) + self.ds_process_group) param.data = param_buffer.narrow(0, 0, param.ds_numel).view(param.ds_shape).to( @@ -866,7 +859,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): flat_tensor = torch.empty(partition_sz * self.world_size, dtype=get_only_unique_item(p.dtype for p in params), - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), requires_grad=False) partitions: List[Parameter] = [] for i in range(self.world_size): @@ -875,10 +868,12 @@ class Init(InsertPostInitMethodToModuleSubClasses): partition_sz * i, partition_sz)) - instrument_w_nvtx(torch.cat)( - [p.ds_tensor.to(torch.cuda.current_device()) for p in params], - out=partitions[self.rank]) - handle = torch_allgather_fn(partitions[self.rank], + instrument_w_nvtx(torch.cat)([ + p.ds_tensor.to(get_accelerator().current_device_name()) + for p in params + ], + out=partitions[self.rank]) + handle = _dist_allgather_fn(partitions[self.rank], flat_tensor, self.ds_process_group) @@ -930,16 +925,16 @@ class Init(InsertPostInitMethodToModuleSubClasses): def padding_size(): return self._padding_size(param) - def partitioned_size(): - return self._partitioned_size(param) + def partition_numel(): + return self._partition_numel(param) def item_override(): param.all_gather() return param._orig_item() - def ds_summary(slf: torch.Tensor) -> dict: + def ds_summary(slf: torch.Tensor, use_debug_name: bool = False) -> dict: return { - "id": slf.ds_id, + "id": debug_param2name_id(slf) if use_debug_name else slf.ds_id, "status": slf.ds_status.name, "numel": slf.numel(), "ds_numel": slf.ds_numel, @@ -973,7 +968,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): # Partitioning size utilities param.aligned_size = aligned_size param.padding_size = padding_size - param.partitioned_size = partitioned_size + param.partition_numel = partition_numel param.ds_summary = 
types.MethodType(ds_summary, param) param.item = allgather_before(param.item) @@ -987,7 +982,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): remainder = param.ds_numel % self.world_size return (self.world_size - remainder) if remainder else 0 - def _partitioned_size(self, param): + def _partition_numel(self, param): return param.ds_tensor.ds_numel def _ensure_availability_of_partitioned_params(self, params): @@ -995,10 +990,10 @@ class Init(InsertPostInitMethodToModuleSubClasses): swap_in_flight = [] for param in params: if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: - assert param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE and param.ds_status == ZeroParamStatus.NOT_AVAILABLE + assert param.ds_tensor.final_location == OffloadDeviceEnum.nvme and param.ds_status == ZeroParamStatus.NOT_AVAILABLE swap_in_list.append(param) if param.ds_tensor.status == PartitionedParamStatus.INFLIGHT: - assert param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE and param.ds_status == ZeroParamStatus.NOT_AVAILABLE + assert param.ds_tensor.final_location == OffloadDeviceEnum.nvme and param.ds_status == ZeroParamStatus.NOT_AVAILABLE swap_in_flight.append(param) if len(swap_in_list) > 0: swap_in_list[0].nvme_swapper.swap_in(swap_in_list, async_op=False) @@ -1067,7 +1062,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): # if numel in empty_buffers: # empty_buffers[numel].append(buffer) - # if torch.distributed.get_rank(): + # if deepspeed.comm.get_rank(): # print(f"Releasing {param.data.numel()}") if param.ds_tensor is not None and not has_been_updated: @@ -1081,7 +1076,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) - if param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE: + if param.ds_tensor.final_location == OffloadDeviceEnum.nvme: print_rank_0( f"Param {param.ds_id} partition released since it exists in nvme", force=False) @@ -1094,9 +1089,9 @@ 
class Init(InsertPostInitMethodToModuleSubClasses): if param.ds_tensor is None: final_location = None - if self.remote_device == OFFLOAD_NVME_DEVICE and self.param_swapper.swappable_tensor( + if self.remote_device == OffloadDeviceEnum.nvme and self.param_swapper.swappable_tensor( numel=partition_size): - final_location = OFFLOAD_NVME_DEVICE + final_location = OffloadDeviceEnum.nvme buffer = self.param_swapper.get_buffer(param, partition_size) partitioned_tensor = torch.empty(0, dtype=param.dtype, @@ -1110,10 +1105,11 @@ class Init(InsertPostInitMethodToModuleSubClasses): partitioned_tensor = torch.empty( partition_size, dtype=param.dtype, - device=OFFLOAD_CPU_DEVICE if self.remote_device - == OFFLOAD_NVME_DEVICE else self.remote_device) + device=OffloadDeviceEnum.cpu if self.remote_device + == OffloadDeviceEnum.nvme else self.remote_device) if self.pin_memory: - partitioned_tensor = partitioned_tensor.pin_memory() + partitioned_tensor = get_accelerator().pin_memory( + partitioned_tensor) partitioned_tensor.requires_grad = False param.ds_tensor = partitioned_tensor @@ -1161,7 +1157,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) - if param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE: + if param.ds_tensor.final_location == OffloadDeviceEnum.nvme: self.param_swapper.swap_out_and_release([param]) print_rank_0( f"ID {param.ds_id} Offloaded to nvme offload and buffers released.") @@ -1205,7 +1201,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): f'After allocate allgather param {debug_param2name_id_shape_status(param)} {aligned_param_size} {partition_size} ', force=False) - torch.cuda.synchronize() + get_accelerator().synchronize() print_rank_0( f"{'--'* hierarchy}----allgather param with {debug_param2name_id_shape_status(param)} partition size={partition_size}" @@ -1218,10 +1214,11 @@ class Init(InsertPostInitMethodToModuleSubClasses): # return None if 
self.use_all_gather_base: # try the _all_gather_base on PyTorch master branch - handle = dist._all_gather_base(flat_tensor, - param.ds_tensor.cuda(), - group=self.ds_process_group, - async_op=async_op) + handle = dist.all_gather_base(flat_tensor, + param.ds_tensor.to( + get_accelerator().device_name()), + group=self.ds_process_group, + async_op=async_op) else: partitions = [] for i in range(self.world_size): @@ -1253,7 +1250,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): local_tensors = [] for param in param_list: partition_sizes.append(param.ds_tensor.ds_numel) - local_tensors.append(param.ds_tensor.cuda()) + local_tensors.append(param.ds_tensor.to(get_accelerator().device_name())) # allocate memory for allgather params allgather_params = [] @@ -1274,17 +1271,17 @@ class Init(InsertPostInitMethodToModuleSubClasses): if self.use_all_gather_base: # try the _all_gather_base from Pytorch master - h = dist._all_gather_base(allgather_params[param_idx], - input_tensor, - group=self.ds_process_group, - async_op=True) + h = dist.all_gather_base(allgather_params[param_idx], + input_tensor, + group=self.ds_process_group, + async_op=True) else: output_list = [] for i in range(self.world_size): psize = partition_sizes[param_idx] partition = allgather_params[param_idx].narrow(0, i * psize, psize) output_list.append(partition) - if not partition.is_cuda: + if not get_accelerator().on_accelerator(partition): logger.warning( f'param {param_idx}, partition {i} is not on CUDA, partition shape {partition.size()}' ) @@ -1307,7 +1304,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): param.ds_numel).view(param.ds_shape).data # guarantee the communication to be completed - torch.cuda.synchronize() + get_accelerator().synchronize() return None @@ -1321,7 +1318,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): flat_tensor = torch.empty(tensor_size, dtype=param_list[0].dtype, device=self.local_device) - flat_tensor.requres_grad = False + flat_tensor.requires_grad = 
False partitions = [] for i in range(self.world_size): start = partition_size * i @@ -1339,10 +1336,10 @@ class Init(InsertPostInitMethodToModuleSubClasses): offset += param_numel - torch.distributed.all_gather(partitions, - partitions[self.rank], - group=self.ds_process_group, - async_op=False) + dist.all_gather(partitions, + partitions[self.rank], + group=self.ds_process_group, + async_op=False) param_offset = 0 for param in param_list: @@ -1436,11 +1433,11 @@ class Init(InsertPostInitMethodToModuleSubClasses): #print("after reduce scatter gradients") input_list.append(input) - rank = torch.distributed.get_rank(group=self.ds_process_group) - handle = torch.distributed.reduce_scatter(input_list[rank], - input_list, - group=self.ds_process_group, - async_op=True) + rank = dist.get_rank(group=self.ds_process_group) + handle = dist.reduce_scatter(input_list[rank], + input_list, + group=self.ds_process_group, + async_op=True) return handle, input_list[rank] @@ -1472,7 +1469,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): assert partition_buffer.numel( ) >= partition_size, f"The partition buffer size {partition_buffer.numel()} should match the size of param.ds_tensor {partition_size}" - rank = torch.distributed.get_rank(group=self.ds_process_group) + rank = dist.get_rank(group=self.ds_process_group) start = partition_size * rank end = start + partition_size @@ -1526,7 +1523,7 @@ class GatheredParameters: again upon exit. Args: - params (``torch.nn.Parameter``): A single parameter or a list of parameters to collect. + params (``torch.nn.Parameter``): A single parameter, or an iterable of parameters (list, tuple, generator) of parameters to collect. It's assumed that all parameters are zero params. modifier_rank (int, optional): If specified, this rank's parameter will be broadcasted on exit from the context. This argument is required if ``params`` are @@ -1536,9 +1533,11 @@ class GatheredParameters: registered as external parameters of ``fwd_module``. 
See :meth:`deepspeed.zero.register_external_parameter`. enabled (bool, optional): If ``False``, this context is a no-op. Defaults to ``True``. - Important: Make sure to use ``modifier_rank`` that is not ``None`` (e.g. ``modifier_rank=0``) + Important: Make sure to use ``modifier_rank`` that is not ``None`` (e.g., ``modifier_rank=0``) if you need the GPU memory allocated by gather to be released upon exit from the context manager. + Important: if ``params`` isn't an iterable of parameters or a single parameter it'll be silently ignored! + Examples ======== @@ -1552,12 +1551,12 @@ class GatheredParameters: with deepspeed.zero.GatheredParameters(linear.weight, modifier_rank=0): - if torch.distributed.get_rank() == 0: + if deepspeed.comm.get_rank() == 0: linear.weight.zero_() with deepspeed.zero.GatheredParameters(linear.weight, modifier_rank=0): - if torch.distributed.get_rank() == 0: + if deepspeed.comm.get_rank() == 0: linear.weight.zero_() #. Collect a partitioned weight to pass to another module during @@ -1591,7 +1590,7 @@ class GatheredParameters: # manager gathers (unpartitions) the params of the current layer, then loads from # the state dict and then re-partitions them again with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): - if torch.distributed.get_rank() == 0: + if deepspeed.comm.get_rank() == 0: module._load_from_state_dict(state_dict, prefix) for name, child in module._modules.items(): @@ -1600,15 +1599,20 @@ class GatheredParameters: load(model, prefix="") - If this approach is not used, then the full model will first get copied to each GPU. For models - bigger than the memory of a single gpu this method is required. + If this approach is not used, then the full model will first be copied to each GPU. For models + bigger than the memory of a single GPU, this method is required. 
""" self.enabled = enabled if not enabled: return - if not isinstance(params, list): + if isinstance(params, Iterable) and not isinstance(params, torch.Tensor): + # deal with generators like model.parameters() + # must convert to list to be able to iterate more than once if we get a generator + params = list(params) + else: + # single param params = [params] # enable if at least one is zero-param, otherwise a noop @@ -1619,12 +1623,12 @@ class GatheredParameters: self.params = [p for p in params if hasattr(p, "ds_id")] self.src_rank = None if modifier_rank is not None: - if self.params[0].ds_process_group == torch.distributed.group.WORLD: + if self.params[0].ds_process_group == dist.get_world_group(): self.src_rank = modifier_rank else: # A group was specified; convert DP rank to global rank - self.src_rank = _get_global_rank(self.params[0].ds_process_group, - modifier_rank) + self.src_rank = dist.get_global_rank(self.params[0].ds_process_group, + modifier_rank) self.fwd_module = fwd_module if self.fwd_module is not None: # is a no-op if already registered @@ -1640,13 +1644,14 @@ class GatheredParameters: if not self.enabled: return if self.src_rank is None: + self.params[0].partition(param_list=self.params, has_been_updated=False) return handles = [ - torch.distributed.broadcast(p, - self.src_rank, - group=p.ds_process_group, - async_op=True) for p in self.params + dist.broadcast(p, + self.src_rank, + group=p.ds_process_group, + async_op=True) for p in self.params ] for h in handles: h.wait() diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py index 4958dcf3c3a4fadba84952147ce74a955cc46a6f..f6edfe626f0bdf1afa004fc34d798a9888598fd2 100644 --- a/deepspeed/runtime/zero/partitioned_param_coordinator.py +++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py @@ -4,18 +4,17 @@ Licensed under the MIT license. 
""" from dataclasses import dataclass -import functools import collections -from collections import OrderedDict, UserDict -from typing import Deque, Dict, Iterable, Set, Tuple -import torch -from torch.cuda import Event, Stream -from torch.nn import Module, Parameter +from collections import UserDict +from typing import Deque, Set +from deepspeed import comm as dist from deepspeed.utils.logging import logger +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.zero.partition_parameters import * -from deepspeed.runtime.zero.offload_constants import * from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus +from deepspeed.utils.debug import debug_module2name_id, debug_param2name_id +from deepspeed.accelerator import get_accelerator def debug_rank0(message: str) -> None: @@ -33,7 +32,7 @@ def iter_params(module: Module, recurse=False) -> Iterable[Parameter]: return map(lambda pair: pair[1], get_all_parameters(module, recurse)) -class TraceMode(Enum): +class ZeRoTraceMode(Enum): # Record trace of the network during a single forward+backward (for training) or forward (for inference) RECORD = 1 # Use recorded network trace to optimize current forward+backward or forward @@ -67,7 +66,7 @@ class PartitionedParameterCoordinator: prefetch_bucket_sz: int, max_reuse_distance_in_numel: int, max_available_parameters_in_numel: int, - allgather_stream: Stream, + allgather_stream: get_accelerator().Stream, prefetch_nvme: bool = False, ) -> None: # mapping of param -> handle for each param that is currently in flight @@ -75,12 +74,14 @@ class PartitionedParameterCoordinator: # keeps track of the number of submodules invoked so far. 
self.__step_id: int = 0 # network tracing mode - self.__trace_mode: TraceMode = TraceMode.RECORD + self.__trace_mode: ZeRoTraceMode = ZeRoTraceMode.RECORD # sequence of submodules/parameters in forward pass + backward pass self.__submodule_order: Iterable[Module] = [] self.__param_order: Iterable[__class__.__ParamInTrace] = [] self.__most_recent_step_id_param_fetched_for = collections.defaultdict( lambda: int(-1e10)) + self.__step_id_module_fetched_for = collections.defaultdict( + lambda: collections.deque()) # number of available params, and max number of available params self.__n_available_params: int = 0 self.__max_n_available_params: int = max_available_parameters_in_numel @@ -94,7 +95,7 @@ class PartitionedParameterCoordinator: self.hierarchy: int = 0 # stream that will be used for allgather operations - self.__allgather_stream: Stream = allgather_stream + self.__allgather_stream: get_accelerator().Stream = allgather_stream # limit the number of fetch events that can be queued at once # otherwise, what happens is memory is allocated by the host thread at the @@ -105,7 +106,7 @@ class PartitionedParameterCoordinator: # cudaMallocAsync/cudaFreeAsync. Choosing to not expose this to the user now # because ideally in the future its replaced by an async allocation # mechanism which doesn't require any configuration by the user. - self.__ongoing_fetch_events: Deque[Event] = collections.deque() + self.__ongoing_fetch_events: Deque[get_accelerator().Event] = collections.deque() # TODO. 
make this configurable via JSON self.__max_ongoing_fetch_events: int = 2 @@ -126,24 +127,29 @@ class PartitionedParameterCoordinator: self.__param_queue = None def is_complete_trace(self) -> bool: - return self.__trace_mode == TraceMode.COMPLETE + return self.__trace_mode == ZeRoTraceMode.COMPLETE def is_invalid_trace(self) -> bool: - return self.__trace_mode == TraceMode.INVALID + return self.__trace_mode == ZeRoTraceMode.INVALID def is_record_trace(self) -> bool: - return self.__trace_mode == TraceMode.RECORD + return self.__trace_mode == ZeRoTraceMode.RECORD def _invalidate_trace(self) -> None: if self.is_invalid_trace(): raise RuntimeError("attempted to invalidate already invalid trace") - self.__trace_mode = TraceMode.INVALID + self.__trace_mode = ZeRoTraceMode.INVALID self._clear_trace_structures() def trace_prologue(self, sub_module: Module) -> None: if self.is_complete_trace(): # sub_module must match expectation else invalidate trace cache if sub_module != self.__submodule_order[self.__step_id]: + expected_module_id = self.__submodule_order[self.__step_id].id + debug_rank0( + f"Invalidate trace cache @ step {self.__step_id}: " + f"expected module {expected_module_id}, but got module {sub_module.id}" + ) self._invalidate_trace() def record_module(self, sub_module: Module) -> None: @@ -151,17 +157,27 @@ class PartitionedParameterCoordinator: if not self.is_record_trace(): raise RuntimeError( f"attempted to record trace when status = {self.__trace_mode}") + self.__submodule_order.append(sub_module) + self.__step_id_module_fetched_for[sub_module.id].append(self.__step_id) def record_parameters(self, sub_module: Module) -> None: """adds sub module to trace""" if not self.is_record_trace(): raise RuntimeError( f"attempted to record trace when status = {self.__trace_mode}") + + step_id = self.__step_id_module_fetched_for[sub_module.id].popleft() for param in sorted(set(iter_params(sub_module)), key=lambda p: p.ds_id): self.__param_order.append( 
__class__.__ParamInTrace(param=param, - step_id_last_used_at=self.__step_id)) + step_id_last_used_at=step_id)) + + def construct_parameter_trace_from_module_trace(self): + """use module trace to construct parameter trace""" + self.__param_order = [] + for sub_module in self.__submodule_order: + self.record_parameters(sub_module) def reset_step(self) -> None: """indicate that we have completed one fwd+bwd for the model""" @@ -171,31 +187,49 @@ class PartitionedParameterCoordinator: f"{[p.ds_summary for p in self.__inflight_param_registry.keys()]}") if not self.is_complete_trace(): # not self.trace_complete: - # Make sure that recorded parameter and submodule orders are - # identical across ranks + # Make sure that recorded submodule orders are identical across ranks assert_ints_same_as_other_ranks([m.id for m in self.__submodule_order]) - assert_ints_same_as_other_ranks([p.param.ds_id for p in self.__param_order]) - assert_ints_same_as_other_ranks( - [p.step_id_last_used_at for p in self.__param_order]) if self.is_record_trace(): # Successfully recorded a trace + self.construct_parameter_trace_from_module_trace() + # Make sure that recorded parameter orders are identical across ranks + assert_ints_same_as_other_ranks( + [p.param.ds_id for p in self.__param_order]) + assert_ints_same_as_other_ranks( + [p.step_id_last_used_at for p in self.__param_order]) + self.__submodule_order = tuple(self.__submodule_order) # freeze self.__param_order = tuple(self.__param_order) # freeze - self.__trace_mode = TraceMode.COMPLETE # self.trace_complete = True + self.__trace_mode = ZeRoTraceMode.COMPLETE print_rank_0( - f"completed trace: {[m.id for m in self.__submodule_order]}", + f"completed record trace: {[m.id for m in self.__submodule_order]}", force=False) else: # Enable trace recording for next forward/backward pass - self.__trace_mode = TraceMode.RECORD + self.__trace_mode = ZeRoTraceMode.RECORD self.__param_queue = collections.deque(self.__param_order) # reset fetch queue 
self.__most_recent_step_id_param_fetched_for = collections.defaultdict( lambda: int(-1e10)) + self.__step_id_module_fetched_for = collections.defaultdict( + lambda: collections.deque()) self.__step_id = 0 self.__n_available_params = 0 + def _dump_params(self, tag, sub_module, params, step_id=None): + if step_id is None: + step_id = self.__step_id + param_names = [debug_param2name_id(p) for p in params] + print( + f'{tag} step = {step_id} mod = {debug_module2name_id(sub_module)} p_names = {param_names}' + ) + + def _dump_param_ids(self, tag, mod_id, p_ids, step_id=None): + if step_id is None: + step_id = self.__step_id + print(f'{tag} mod = {mod_id}, step = {step_id}, p_ids = {p_ids}') + """Fetch and Release Fetching, prefetching, and releasing parameters """ @@ -228,7 +262,7 @@ class PartitionedParameterCoordinator: param.ds_active_sub_modules.add(current_submodule.id) debug_rank0(f"-wait: {param.ds_summary()}") if param in self.__inflight_param_registry: - with torch.cuda.stream(self.__allgather_stream): + with get_accelerator().stream(self.__allgather_stream): while self.__ongoing_fetch_events and self.__ongoing_fetch_events[ 0].query(): self.__ongoing_fetch_events.popleft() @@ -238,12 +272,12 @@ class PartitionedParameterCoordinator: self.__inflight_param_registry.pop(param).wait() - event = Event() + event = get_accelerator().Event() event.record() self.__ongoing_fetch_events.append(event) assert param.ds_status == ZeroParamStatus.AVAILABLE, param.ds_summary() - torch.cuda.current_stream().wait_stream(self.__allgather_stream) + get_accelerator().current_stream().wait_stream(self.__allgather_stream) # kick off parameter prefetches for upcoming modules # don't prefetch if we dont have a completed model trace @@ -264,15 +298,23 @@ class PartitionedParameterCoordinator: self.__most_recent_step_id_param_fetched_for[ param_in_trace.param] = param_in_trace.step_id_last_used_at discarded_from_prefetch_queue.add(param_in_trace.param) + if discarded_from_prefetch_queue 
!= params_not_already_fetched: raise RuntimeError( f"tracing error at step {self.__step_id}: \n" f"module id: {current_submodule.id}, training: {current_submodule.training}\n" f"expected the next {len(params_not_already_fetched)} parameters in the " - f"parameter fetch queue to be {tuple(p.ds_summary() for p in params_not_already_fetched)} \n" - f"but got \n {tuple(p.ds_summary() for p in discarded_from_prefetch_queue)}." + f"parameter fetch queue to be {tuple(p.ds_summary(use_debug_name=True) for p in params_not_already_fetched)} \n" + f"but got \n {tuple(p.ds_summary(use_debug_name=True) for p in discarded_from_prefetch_queue)}." ) + def _is_currently_on_nvme(param): + if param.nvme_swapper is None: + return False + + return param.ds_tensor.final_location == OffloadDeviceEnum.nvme \ + and param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE + # kick off all gather for params in the next few submodules (prefetch) if self.__prefetch_bucket_sz > 0: max_params_to_prefetch = min( @@ -283,11 +325,25 @@ class PartitionedParameterCoordinator: while self.__param_queue and numel_prefetching < max_params_to_prefetch: param_in_trace: __class__.__ParamInTrace = self.__param_queue.popleft( ) - self.__most_recent_step_id_param_fetched_for[ - param_in_trace.param] = param_in_trace.step_id_last_used_at - if param_in_trace.param not in params_to_prefetch: + + if _is_currently_on_nvme(param_in_trace.param): + # nvme prefetch is handled elsewhere. 
Need to break here to preserve fetch order + self.__param_queue.appendleft(param_in_trace) + break + + do_prefetch = param_in_trace.param.ds_status == ZeroParamStatus.NOT_AVAILABLE + if param_in_trace.param in params_to_prefetch: + # Avoid duplicates + do_prefetch = False + + self.__most_recent_step_id_param_fetched_for[param_in_trace.param] = \ + max(self.__most_recent_step_id_param_fetched_for[param_in_trace.param], + param_in_trace.step_id_last_used_at) + + if do_prefetch: params_to_prefetch.add(param_in_trace.param) numel_prefetching += param_in_trace.param.ds_numel + for param in params_to_prefetch: debug_rank0(f"-prefetch: {param.ds_summary()}") self.__all_gather_params(params_to_prefetch) @@ -339,13 +395,23 @@ class PartitionedParameterCoordinator: self.__n_available_params += param.ds_numel if partitioned_params: - with torch.cuda.stream(self.__allgather_stream): + with get_accelerator().stream(self.__allgather_stream): handle = partitioned_params[0].all_gather_coalesced(partitioned_params) for param in partitioned_params: assert param.ds_status == ZeroParamStatus.INFLIGHT, param.ds_summary() self.__inflight_param_registry[param] = handle + # Release swap buffers for persisted params on nvme since they will never be partitioned or evicted from GPU + swap_persisted_params = [ + p for p in partitioned_params + if p.ds_persist and p.ds_tensor.final_location == OffloadDeviceEnum.nvme + ] + if swap_persisted_params: + swap_persisted_params[ + 0].nvme_swapper.remove_partition_and_release_buffers( + swap_persisted_params) + @instrument_w_nvtx def __release_param(self, param: Parameter) -> None: if param.ds_status == ZeroParamStatus.AVAILABLE and not param.ds_active_sub_modules: @@ -364,12 +430,23 @@ class PartitionedParameterCoordinator: params_to_release = set(p.ds_id for p in iter_params(submodule_to_release) if not p.ds_persist) + # Problem: When prefetcher scans the param trace, it skips AVAILABLE params. 
+ # This creates issues if those params are released before the skipped uses: + # 1) It hurts performance as the skipped uses are never prefetched. + # 2) For nvme params, we run out of swap buffers because the prefetch order + # diverges from the trace. + # Solution: Don't release params whose reuse was skipped by prefetch. This is + # possible because we detect such skips during prefetch and mark those params. + for param in iter_params(submodule_to_release): + if self.__most_recent_step_id_param_fetched_for[param] > step_id: + params_to_release.discard(param.ds_id) + # examine all modules within `max_reuse_dist_in_numel` of the current step, # if we see any of the candidate parameters to be released reoccur while # doing this, remove them from the set of parameters to release. params_traversed = 0 for module in self.__submodule_order[step_id:]: - if params_traversed > self.__max_reuse_dist_in_numel: + if params_traversed >= self.__max_reuse_dist_in_numel: break for param in iter_params(module): params_to_release.discard(param.ds_id) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 836b1ee0a6489944b1188b05aa52fa118bf4b9fc..9f7cb384a74afad70a2d5088e4368fe673ecdf78 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -3,49 +3,35 @@ Licensed under the MIT license. 
""" +import sys import gc -from dataclasses import dataclass -import functools -import os import collections -from collections import OrderedDict, UserDict -import itertools -from typing import Deque, Dict, Iterable, Set, Tuple -import torch -from torch.cuda import Event, Stream -from torch.nn import Module, Parameter -import torch.distributed as dist -import math -from torch._six import inf -from torch.nn import Module -from torch.nn.parameter import Parameter - -from deepspeed.utils.logging import logger -from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler +from typing import Deque, Dict, Tuple + +from deepspeed.runtime import ZeROOptimizer +from deepspeed.utils import logger +from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced -from deepspeed.runtime.utils import get_global_norm, see_memory_usage, is_model_parallel_parameter, DummyOptim +from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter from deepspeed.runtime.zero.partition_parameters import * -from deepspeed.runtime.zero.partition_parameters import _init_external_params -from deepspeed.runtime.zero.constants import ZERO_OPTIMIZATION_WEIGHTS +from deepspeed.runtime.zero.config import ZeroStageEnum +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload from deepspeed.ops.adam import DeepSpeedCPUAdam -from deepspeed.ops.op_builder import UtilsBuilder -from deepspeed.runtime.zero.offload_constants import * -from deepspeed.runtime.zero.partitioned_param_coordinator import PartitionedParameterCoordinator, iter_params from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus from deepspeed.runtime.swap_tensor.partitioned_optimizer_swapper import PartitionedOptimizerSwapper from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import 
PipelinedOptimizerSwapper from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FP32_FLAT_GROUPS, PARTITION_COUNT, ZERO_STAGE +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import UtilsBuilder # Toggle this to true to enable correctness test # with gradient partitioning and without pg_correctness_test = False -FWD_MODULE_STACK = list() -from deepspeed.utils.debug import debug_module2name_id, debug_param2name_id, debug_param2name_id_numel, debug_param2name_id_shape_device, debug_module2name_class, printflock, log_rank_file - def print_rank_0(message, debug=False, force=False): - rank = torch.distributed.get_rank() + rank = dist.get_rank() if rank == 0 and (debug or force): print(message) # other variations @@ -73,157 +59,10 @@ def move_to_cpu(tensor_list): tensor.data = tensor.data.cpu() -def is_builtin_type(obj): - # https://stackoverflow.com/a/17795199 - return obj.__class__.__module__ == '__builtin__' or obj.__class__.__module__ == "builtins" - - -#apply torch.autograd.Function that calls a backward_function to tensors in output -def _apply_to_tensors_only(module, functional, backward_function, outputs): - if isinstance(outputs, (tuple, list)): - touched_outputs = [] - for output in outputs: - touched_output = _apply_to_tensors_only(module, - functional, - backward_function, - output) - touched_outputs.append(touched_output) - return outputs.__class__(touched_outputs) - elif isinstance(outputs, dict): - # apply inplace to avoid recreating dict inherited objects - for key in outputs.keys(): - outputs[key] = _apply_to_tensors_only(module, - functional, - backward_function, - outputs[key]) - return outputs - elif type(outputs) is torch.Tensor: - return functional.apply(module, backward_function, outputs) - else: - if not is_builtin_type(outputs): - logger.warning( - f"A module has unknown inputs or outputs type ({type(outputs)}) and the tensors embedded in it cannot be detected. 
" - "The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and " - "output tensors and therefore may not get triggered properly.") - return outputs - - -#for each tensor in outputs run the forward_function and register backward_function as hook -def _apply_forward_and_backward_to_tensors_only(module, - forward_function, - backward_function, - outputs): - if type(outputs) is tuple: - touched_outputs = [] - for output in outputs: - touched_output = _apply_forward_and_backward_to_tensors_only( - module, - forward_function, - backward_function, - output) - touched_outputs.append(touched_output) - return tuple(touched_outputs) - elif type(outputs) is torch.Tensor: - forward_function(outputs) - if outputs.requires_grad: - outputs.register_hook(backward_function) - return outputs - else: - return outputs - - -class ZeROOrderedDict(OrderedDict): - def __init__(self, parent_module, *args, **kwargs): - """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. 
- - Args: - parent_module (``collections.OrderedDict``): the collection to replace - """ - - super().__init__(*args, **kwargs) - self._parent_module = parent_module - self._in_forward = False - - def __getitem__(self, key): - param = super().__getitem__(key) - - # Params can be registered as None (e.g., bias) - if param is None: - return param - - if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: - if self._parent_module._parameters._in_forward: - register_external_parameter(FWD_MODULE_STACK[-1], param) - param.all_gather() - print_rank_0( - f'Registering external parameter from getter {key} ds_id = {param.ds_id}', - force=False) - - return param - - -def _inject_parameters(module, cls): - for module in module.modules(): - if cls == ZeROOrderedDict: - new_param = cls(parent_module=module) - else: - new_param = cls() - - for key, param in module._parameters.items(): - new_param[key] = param - module._parameters = new_param - - -class PreBackwardFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, module, pre_backward_function, outputs): - ctx.module = module - ctx.pre_backward_function = pre_backward_function - if not hasattr(module, "applied_pre_backward_ref_cnt"): - module.applied_pre_backward_ref_cnt = 0 - module.applied_pre_backward_ref_cnt += 1 - #print(f"After Forward: {ctx.module.__class__.__name__}") - outputs = outputs.detach() - return outputs - - @staticmethod - def backward(ctx, *args): - #print(f"Before Backward: {ctx.module.__class__.__name__}") - ctx.pre_backward_function(ctx.module) - return (None, None) + args - - -class PostBackwardFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, module, pre_backward_function, output): - ctx.module = module - if output.requires_grad: - #TODO SOME TIMES post backward does not seem to be triggered debug in detail - #Should only cause increase in memory not correctness issue - #if output.grad_fn.__class__.__name__ == 'ViewBackward': - # ctx.view=True - # print(f"Warning view 
tensor for input to module : {module.__class__.__name__}. Backward hooks may not trigger properly") - #assert len(module.parameters(recurse=False)), "The input tensor to the module is a view, and autograd Function or register_hook is not triggered with view tensors." - #if module.ds_grads_remaining == 0: - # print(f"Before Forward: {ctx.module.__class__.__name__}") - module.ds_grads_remaining += 1 - ctx.pre_backward_function = pre_backward_function - output = output.detach() - return output - - @staticmethod - def backward(ctx, *args): - ctx.module.ds_grads_remaining = ctx.module.ds_grads_remaining - 1 - if ctx.module.ds_grads_remaining == 0: - ctx.pre_backward_function(ctx.module) - #print(f"After Backward: {ctx.module.__class__.__name__}") - return (None, None) + args - - INITIAL_MICRO_STEP_ID = -1 -class DeepSpeedZeroOptimizer_Stage3(object): +class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): """ DeepSpeedZeroOptimizer designed to reduce the memory footprint required for training large deep learning models. @@ -249,6 +88,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): max_reuse_distance=1000000000, max_live_parameters=1000000000, param_persistence_threshold=100000, + model_persistence_threshold=sys.maxsize, dp_process_group=None, reduce_scatter=True, overlap_comm=False, @@ -264,14 +104,14 @@ class DeepSpeedZeroOptimizer_Stage3(object): elastic_checkpoint=False, aio_config=None): - see_memory_usage("Stage 3 initialize beginning", force=False) + see_memory_usage("Stage 3 initialize beginning", force=True) print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", force=False) if dist.get_rank() == 0: logger.info(f"Reduce bucket size {reduce_bucket_size}") - logger.info(f"Allgather bucket size {prefetch_bucket_size}") + logger.info(f"Prefetch bucket size {prefetch_bucket_size}") # The fused optimizer does all the work. We need this layer for two reason: # 1. maintain same user API from apex.fp16_utils # 2. 
keep common stuff here in case we need to add ne552w fused optimizer later @@ -281,10 +121,10 @@ class DeepSpeedZeroOptimizer_Stage3(object): # - assume all params requires grad # - flat by groups, not keeping state. TODO: remove state explicitly? # - master grad and unflat master weight never exist. TODO: a way to save out unflat master? - if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") + if not get_accelerator().is_available(): + raise SystemError("Cannot use fp16 without accelerator.") + self.optimizer = init_optimizer - self.using_real_optimizer = not isinstance(self.optimizer, DummyOptim) # Load pre-built or JIT compile (un)flatten ops util_ops = UtilsBuilder().load() @@ -293,6 +133,9 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.dtype = self.optimizer.param_groups[0]['params'][0].dtype self._global_grad_norm = 0. + self.custom_loss_scaler = False + self.external_loss_scale = None + self.optimizer_swapper = None self.swap_optimizer = False @@ -304,62 +147,45 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.params_in_nvme_and_cpu = False self.max_params_in_cpu = 0 + self.parameter_offload = DeepSpeedZeRoOffload( + module=module, + timers=timers, + ds_config=ds_config, + overlap_comm=overlap_comm, + prefetch_bucket_size=prefetch_bucket_size, + max_reuse_distance=max_reuse_distance, + max_live_parameters=max_live_parameters, + param_persistence_threshold=param_persistence_threshold, + model_persistence_threshold=model_persistence_threshold, + offload_param_config=offload_optimizer_config, + mpu=mpu) + + self.persistent_parameters = self.parameter_offload.persistent_parameters self._configure_offloading(offload_optimizer_config, offload_param_config) - self._convert_to_zero_parameters(ds_config, module, mpu) - - for m in module.modules(): - _init_external_params(m) - self.module = module self.elastic_checkpoint = elastic_checkpoint - # Replace ._parameters with a new class to enable auto-registration of - # external 
parameters - _inject_parameters(module, ZeROOrderedDict) - self.__inf_or_nan_tracker: Tensor = torch.zeros( 1, dtype=torch.bool, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), requires_grad=False) self.deepspeed_adam_offload = (self.offload_optimizer and type(init_optimizer) == DeepSpeedCPUAdam) - self.device = torch.cuda.current_device( - ) if not self.offload_optimizer else OFFLOAD_CPU_DEVICE + self.device = get_accelerator().current_device_name( + ) if not self.offload_optimizer else OffloadDeviceEnum.cpu ### streams used for overlapping computation with communication - self.__allgather_stream = Stream( - ) if overlap_comm else torch.cuda.default_stream() - self.__reduce_and_partition_stream = Stream( - ) if overlap_comm else torch.cuda.default_stream() + self.__reduce_and_partition_stream = get_accelerator().Stream( + ) if overlap_comm else get_accelerator().default_stream() ############################################################################ - see_memory_usage("Before Partitioned Parameter Coordinator", force=False) - self.param_coordinators = {} - self._prefetch_bucket_sz = int(prefetch_bucket_size) - self._max_reuse_distance_in_numel = int(max_reuse_distance) - self._max_available_parameters_in_numel = int(max_live_parameters) - see_memory_usage("After Partitioned Parameter Coordinator", force=False) - self.__n_caching_allocator_flushes = 0 #-------------Stage 3 Setup-------------------# - # parameters smaller than the threshold will be collectively gathered at the - # end of the optimizer step and will be kept till the end of the backward pass - # TODO maybe worth just replicating these parameters and doing all reduce for them - self.persistence_threshold = int(param_persistence_threshold) - - self.persistent_parameters = self.persistent_parameters() - - self.setup_zero_stage3_hooks() - - #resetting ds_tensor just in case parameters have been changed after initialization - #example .half() or .to() - 
#self.reset_ds_tensor() - #---------------------------------------------# self.timers = timers @@ -386,9 +212,9 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.reduce_bucket_size = int(reduce_bucket_size) if self.reduce_scatter: - assert self.communication_data_type in (torch.float16, torch.bfloat16), f"ZeRO-3 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'" - assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" + assert self.communication_data_type in (torch.float16, torch.bfloat16, torch.float32), f"ZeRO-3 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'" + assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-3 with reduce scatter enabled" + assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-3 with reduce scatter enabled" # Holds the mode parameter # The param.data may not hold any meaningful data @@ -416,6 +242,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.all_reduce_print = False self.prefetch_elements = int(prefetch_bucket_size) + self.contiguous_gradients = contiguous_gradients # padding on each partition for alignment purposes @@ -424,11 +251,15 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.sub_group_size = sub_group_size self.sub_group_to_group_id = {} - see_memory_usage("Before creating fp16 partitions", force=False) - self._create_fp16_partitions_with_defragmentation() + + # Trainable parameters + self.trainable_param_groups = self._get_trainable_parameter_groups() + + see_memory_usage("Before creating fp16 partitions", force=True) + 
self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups) num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}", - force=False) + force=True) # Optimizer tensor swapping if self.swap_optimizer: @@ -437,7 +268,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.__params_in_ipg_bucket: List[Parameter] = [] self.is_gradient_accumulation_boundary: bool = True - self.__param_reduce_events: Deque[Event] = collections.deque() + self.__param_reduce_events: Deque[get_accelerator().Event] = collections.deque() # TODO. make this configurable via JSON self.__max_param_reduce_events: int = 2 @@ -478,10 +309,9 @@ class DeepSpeedZeroOptimizer_Stage3(object): f'Largest partitioned param numel = {largest_partitioned_param_numel}', force=False) + self._setup_for_real_optimizer() self.grad_position = {} - if self.using_real_optimizer: - self._setup_for_real_optimizer() - self.set_grad_positions() + self.set_grad_positions() if self.offload_optimizer: self.norm_for_param_grads = {} @@ -502,38 +332,44 @@ class DeepSpeedZeroOptimizer_Stage3(object): #exit(0) # we may have a way of fusing dynamic scale. 
Do not support for now - if self.dtype == torch.float or not dynamic_loss_scale: - loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale - - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=loss_scale_value) - cur_iter = 0 - else: - if dynamic_loss_args is None: - self.loss_scaler = DynamicLossScaler() - else: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - - self.dynamic_loss_scale = True + self.loss_scaler = CreateLossScaler(dtype=self.dtype, + static_loss_scale=static_loss_scale, + dynamic_scaling=dynamic_loss_scale, + dynamic_loss_args=dynamic_loss_args) + self.dynamic_loss_scale = self.loss_scaler.dynamic self.debug_fp16_grads = [{} for _ in self.fp16_groups] + self._link_all_hp_params() + if dist.get_rank(group=self.dp_process_group) == 0: see_memory_usage(f"After initializing ZeRO optimizer", force=True) + def destroy(self): + self.parameter_offload.destroy() + + def _get_trainable_parameter_groups(self): + param_groups = [] + for param_group in self.optimizer.param_groups: + trainable_params = { + "params": [p for p in param_group["params"] if p.requires_grad] + } + param_groups.append(trainable_params) + return param_groups + def _setup_for_real_optimizer(self): - see_memory_usage("Before creating fp32 partitions", force=False) + see_memory_usage("Before creating fp32 partitions", force=True) self._create_fp32_partitions() - see_memory_usage("After creating fp32 partitions", force=False) + see_memory_usage("After creating fp32 partitions", force=True) dist.barrier() # To support pipelined optimizer swapping self._create_next_swappable_fp32_groups() - see_memory_usage("Before initializing optimizer states", force=False) + see_memory_usage("Before initializing optimizer states", force=True) self.initialize_optimizer_states() - see_memory_usage("After initializing optimizer states", force=False) + see_memory_usage("After initializing optimizer states", force=True) dist.barrier() if dist.get_rank() == 0: 
@@ -544,18 +380,20 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.__ipg_bucket_flat_buffer: Tensor = torch.empty( self.reduce_bucket_size, dtype=self.dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) grad_partitions_flat_buffer = None self.__param_id_to_grad_partition: Dict[int, Tensor] = {} all_params = list(itertools.chain.from_iterable(self.fp16_groups)) - grad_partitions_flat_buffer: Tensor = torch.zeros( - sum(p.ds_tensor.ds_numel for p in all_params), - dtype=self.dtype, - device=self.device, - pin_memory=self.offload_optimizer_pin_memory) + grad_partitions_flat_buffer: Tensor = torch.zeros(sum(p.partition_numel() + for p in all_params), + dtype=self.dtype, + device=self.device) + if self.offload_optimizer_pin_memory: + grad_partitions_flat_buffer = get_accelerator().pin_memory( + grad_partitions_flat_buffer) offset = 0 for param in all_params: @@ -563,8 +401,21 @@ class DeepSpeedZeroOptimizer_Stage3(object): param.ds_id] = grad_partitions_flat_buffer.narrow( 0, offset, - param.ds_tensor.numel()) - offset += param.ds_tensor.numel() + param.partition_numel()) + offset += param.partition_numel() + + def _link_all_hp_params(self): + for p in self.module.parameters(): + p._z3_optimizer = self + + def set_lr(self, lr): + """Set the learning rate.""" + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + def get_lr(self): + """Return the current learning rate.""" + return self.optimizer.param_groups[0]["lr"] # TODO. 
factor out to a utility outside of stage3 @staticmethod @@ -593,7 +444,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): offset += tensor_numel gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() # copy tensors (now flattened and contiguous) back to GPU device_buffer = cpu_buffer.to(orig_device) @@ -605,79 +456,34 @@ class DeepSpeedZeroOptimizer_Stage3(object): return device_buffer def _get_param_coordinator(self, training): - if not training in self.param_coordinators: - self.param_coordinators[training] = PartitionedParameterCoordinator( - prefetch_bucket_sz=self._prefetch_bucket_sz, - max_reuse_distance_in_numel=self._max_reuse_distance_in_numel, - max_available_parameters_in_numel=self. - _max_available_parameters_in_numel, - allgather_stream=self.__allgather_stream, - prefetch_nvme=self.params_in_nvme_and_cpu, - ) - - return self.param_coordinators[training] + return self.parameter_offload.get_param_coordinator(training) def _configure_offloading(self, offload_optimizer_config, offload_param_config): ###################### offload optimizer setup ################################## - if offload_optimizer_config is not None: + if offload_optimizer_config is not None and offload_optimizer_config.device != OffloadDeviceEnum.none: self.offload_optimizer = True - self.offload_optimizer_pin_memory = offload_optimizer_config[ - OFFLOAD_OPTIMIZER_PIN_MEMORY] - self.swap_optimizer = offload_optimizer_config[ - OFFLOAD_OPTIMIZER_DEVICE] == OFFLOAD_NVME_DEVICE - self.offload_optimizer_fast_init = offload_optimizer_config[ - OFFLOAD_OPTIMIZER_FAST_INIT] + self.offload_optimizer_pin_memory = offload_optimizer_config.pin_memory + self.swap_optimizer = offload_optimizer_config.device == OffloadDeviceEnum.nvme + self.offload_optimizer_fast_init = offload_optimizer_config.fast_init ###################### offload param setup ################################## - if offload_param_config is not None: - if self.using_real_optimizer: - assert 
self.offload_optimizer, "parameter offload is only available with optimizer state offload" + if offload_param_config is not None and offload_param_config.device != OffloadDeviceEnum.none: self.offload_param = True - self.offload_param_pin_memory = offload_param_config[ - OFFLOAD_PARAM_PIN_MEMORY] - self.params_in_nvme_and_cpu = offload_param_config[ - OFFLOAD_PARAM_DEVICE] == OFFLOAD_NVME_DEVICE - self.max_params_in_cpu = offload_param_config[OFFLOAD_PARAM_MAX_IN_CPU] + self.offload_param_pin_memory = offload_param_config.pin_memory + self.params_in_nvme_and_cpu = offload_param_config.device == OffloadDeviceEnum.nvme + self.max_params_in_cpu = offload_param_config.max_in_cpu print_rank_0( f"FP16 params swapping is {self.params_in_nvme_and_cpu}, Max params in CPU is {self.max_params_in_cpu}", force=False) - def _convert_to_zero_parameters(self, ds_config, module, mpu): - non_zero_params = [p for p in module.parameters() if not is_zero_param(p)] - if non_zero_params: - zero_params = [p for p in module.parameters() if is_zero_param(p)] - if zero_params: - zero_params[0].convert_to_zero_parameters(param_list=non_zero_params) - else: - group = None - if mpu: - group = mpu.get_data_parallel_group() - - if self.params_in_nvme_and_cpu: - remote_device = OFFLOAD_NVME_DEVICE - elif self.offload_param: - remote_device = OFFLOAD_CPU_DEVICE - else: - remote_device = None - - Init(module=module, - data_parallel_group=group, - dtype=self.dtype, - config_dict_or_path=ds_config, - remote_device=remote_device, - pin_memory=self.offload_param_pin_memory, - mpu=mpu) - def _configure_tensor_swapping(self, offload_optimizer_config, aio_config): - nvme_swap_folder = os.path.join( - offload_optimizer_config[OFFLOAD_OPTIMIZER_NVME_PATH], - 'zero_stage_3') + nvme_swap_folder = os.path.join(offload_optimizer_config.nvme_path, + 'zero_stage_3') os.makedirs(nvme_swap_folder, exist_ok=True) - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: logger.info(f'Tensor Swapping: Adding 
optimizer tensors') - swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config[ - OFFLOAD_OPTIMIZER_PIPELINE] else PartitionedOptimizerSwapper + swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config.pipeline else PartitionedOptimizerSwapper self.optimizer_swapper = swapper_type( swap_config=offload_optimizer_config, @@ -712,7 +518,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): '''if the parameter was initialized in nvme then bring it to the destination buffer directly''' if src.status == PartitionedParamStatus.NOT_AVAILABLE: print_rank_0( - f"Swapping in {param.ds_id} with partition size {param.ds_tensor.ds_numel} permanently to CPU" + f"Swapping in {param.ds_id} with partition size {param.partition_numel()} permanently to CPU" ) param.nvme_swapper.swap_into_buffer(param, dest) src.data = dest.data @@ -730,8 +536,8 @@ class DeepSpeedZeroOptimizer_Stage3(object): aggregate_params_count = 0 - for j, param_group in enumerate(self.optimizer.param_groups): - params_in_group = sum([p.ds_tensor.ds_numel for p in param_group['params']]) + for j, param_group in enumerate(self.trainable_param_groups): + params_in_group = sum([p.partition_numel() for p in param_group['params']]) flat_buffer_size = params_in_group @@ -747,9 +553,9 @@ class DeepSpeedZeroOptimizer_Stage3(object): print_rank_0(f"group {j} flat buffer size {flat_buffer_size}", force=False) self.param_groups_fp16_flat_cpu_memory.append( - torch.empty(int(flat_buffer_size), - dtype=self.dtype, - pin_memory=True)) + get_accelerator().pin_memory( + torch.empty(int(flat_buffer_size), + dtype=self.dtype))) else: print_rank_0( f"No flat buffer size. 
Param group size was {params_in_group}", @@ -759,11 +565,12 @@ class DeepSpeedZeroOptimizer_Stage3(object): torch.empty(1, dtype=self.dtype)) - def _create_fp16_partitions_with_defragmentation(self): + def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): dist.barrier() + param_groups: List[List[Parameter]] = tuple( self._create_fp16_sub_groups(param_group["params"]) - for param_group in self.optimizer.param_groups) + for param_group in fp16_param_groups) # bookkeeping related to param groups for param_group_idx, param_group in enumerate(param_groups): @@ -780,7 +587,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): # record total elements of parameter partitions in sub group self.fp16_partitioned_groups_flat_numel.append( - sum(p.ds_tensor.ds_numel for p in sub_group)) + sum(p.partition_numel() for p in sub_group)) # record padding required to align group to world size (only applies to last rank) rank_requires_padding = dist.get_rank( @@ -803,7 +610,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): # contiguous flat buffer for all parameters that we created earlier offset = 0 for sub_group in self.fp16_groups: - sub_group_numel = sum(param.ds_tensor.ds_numel for param in sub_group) + sub_group_numel = sum(param.partition_numel() for param in sub_group) self.fp16_partitioned_groups_flat.append( device_buffer.narrow(0, offset, @@ -815,7 +622,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): for param_group_idx, param_group in enumerate(param_groups): flat_offset = 0 for i, sub_group in enumerate(param_group): - total_elements = sum(p.ds_tensor.ds_numel for p in sub_group) + total_elements = sum(p.partition_numel() for p in sub_group) print_rank_0(f"Params in nvme and cpu {self.params_in_nvme_and_cpu}") #Flat buffer may not be available for parameters that reside in NVME if not self.params_in_nvme_and_cpu or flat_offset + total_elements <= self.param_groups_fp16_flat_cpu_memory[ @@ -851,7 +658,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): if 
should_create_fp16_flat_reuse_buffer: max_partition_numel, largest_partition_numel = 0, None for sub_group in self.fp16_groups: - total_elements = sum(t.ds_tensor.ds_numel for t in sub_group) + total_elements = sum(t.partition_numel() for t in sub_group) if total_elements > max_partition_numel: largest_partition_numel = [t.ds_numel for t in sub_group] max_partition_numel = total_elements @@ -869,7 +676,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): dest = flat_buffer.narrow(0, offset, partitioned_param.ds_numel) if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: print_rank_0( - f"Swapping in {param.ds_id} with elements {param.ds_numel} and partition {param.ds_tensor.ds_numel}" + f"Swapping in {param.ds_id} with elements {param.ds_numel} and partition {param.partition_numel()}" ) param.nvme_swapper.swap_in([param], async_op=False) dest.data.copy_(partitioned_param.data) @@ -899,7 +706,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: swap_path = param.nvme_swapper.get_path(param, True) sub_group_partitions.append((partitioned_param, - param.ds_tensor.ds_numel, + param.partition_numel(), swap_path)) else: sub_group_partitions.append((partitioned_param, @@ -1015,7 +822,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): def _create_fp16_sub_groups(self, params_group): - params_group_numel = sum([param.partitioned_size() for param in params_group]) + params_group_numel = sum([param.partition_numel() for param in params_group]) sub_group_size = self.sub_group_size if sub_group_size is None or sub_group_size >= params_group_numel: @@ -1027,7 +834,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): for param in params_group: sub_group.append(param) - local_sub_group_size += param.partitioned_size() + local_sub_group_size += param.partition_numel() if local_sub_group_size >= sub_group_size or id(param) == id( params_group[-1]): @@ -1039,219 +846,6 @@ class 
DeepSpeedZeroOptimizer_Stage3(object): return sub_groups - # def reset_ds_tensor(self): - # for name, param in self.module.named_parameters(recurse=True): - # assert hasattr(param,'ds_id'), "Parameters have not been converted to be Zero 3 compatible" - # assert (param.ds_status == ZeroParamStatus.NOT_AVAILABLE), "All the parameters must have been partitioned by now" - # param.ds_tensor.data = param.data - - def setup_zero_stage3_hooks(self): - self.hierarchy = 0 - - #reset step if in inference mode - @instrument_w_nvtx - def _end_of_forward_hook(module, *args): - - if not torch._C.is_grad_enabled(): - self._get_param_coordinator(training=False).reset_step() - - #likely one of them should be enough but just to be safe - self._register_hooks_recursively(self.module) - self.module.register_forward_hook(_end_of_forward_hook) - - # Add top module to stack trace - global FWD_MODULE_STACK - FWD_MODULE_STACK.append(self.module) - - def persistent_parameters(self): - persistent_params = [] - total_persistent_parameters = 0 - params_count = 0 - for _, param in self.module.named_parameters(recurse=True): - if param.ds_numel < self.persistence_threshold: - params_count += 1 - param.ds_persist = True - persistent_params.append(param) - total_persistent_parameters += param.ds_numel - - print_rank_0( - f"ZeRO 3: Total persistent parameters: {total_persistent_parameters} in {params_count} params", - force=False) - return persistent_params - - def _register_hooks_recursively(self, module, count=[0]): - my_count = count[0] - module.id = my_count - - #print(f"{module.__class__} : {module.id}") - - for child in module.children(): - count[0] = count[0] + 1 - self._register_hooks_recursively(child, count=count) - - @instrument_w_nvtx - def _pre_forward_module_hook(module, *args): - self.pre_sub_module_forward_function(module) - - @instrument_w_nvtx - def _post_forward_module_hook(module, input, output): - global FWD_MODULE_STACK - FWD_MODULE_STACK.pop() - if output is None: - output = 
[] - elif not isinstance(output, (list, tuple)): - if torch.is_tensor(output): - output = [output] - else: - #print(f'got UNKNOWN type {type(output)}') - outputs = [] - output = output if isinstance(output, dict) else vars(output) - for name, val in output.items(): - if not name.startswith('__') and torch.is_tensor(val): - outputs.append(val) - output = outputs - #print(f'convert output to {output}') - - for item in filter(lambda item: is_zero_param(item), output): - if not any(id(item) in m._external_params for m in FWD_MODULE_STACK): - item.is_external_param = True - module_to_register = FWD_MODULE_STACK[-1] - register_external_parameter(module_to_register, item) - print_rank_0( - f'Registering dangling parameter for module {module_to_register.__class__.__name__}, ds_id = {item.ds_id}.', - force=False) - - # It's possible that the parameter was already external to the completed module. If so, remove it the - # registration as it will be covered by the outer module instead. - if id(item) in module._external_params: - print_rank_0( - f' Unregistering nested dangling parameter from module {module.__class__.__name__}, ds_id = {item.ds_id}', - force=False) - unregister_external_parameter(module, item) - - item.all_gather() - - self.post_sub_module_forward_function(module) - - def _pre_backward_module_hook(module, inputs, output): - @instrument_w_nvtx - def _run_before_backward_function(sub_module): - # some models (e.g. 
Albert) may run multiple forwards on the same layer in a loop - # before doing backwards, so each backward will need a pre-fetch - using reference - # counting to support this scenario - #print(f"COUNTER before: {sub_module.applied_pre_backward_ref_cnt}") - if sub_module.applied_pre_backward_ref_cnt > 0: - self.pre_sub_module_backward_function(sub_module) - sub_module.applied_pre_backward_ref_cnt -= 1 - #print(f"COUNTER after: {sub_module.applied_pre_backward_ref_cnt}") - - return _apply_to_tensors_only(module, - PreBackwardFunction, - _run_before_backward_function, - output) - - #This is an alternate to doing _post_backward_module_hook - #it uses tensor.register_hook instead of using torch.autograd.Function - def _alternate_post_backward_module_hook(module, inputs): - module.ds_grads_remaining = 0 - - #print(f"Before Forward {module.__class__.__name__}") - - def _run_after_backward_hook(*unused): - module.ds_grads_remaining = module.ds_grads_remaining - 1 - if module.ds_grads_remaining == 0: - #print(f"After backward {module.__class__.__name__}") - self.post_sub_module_backward_function(module) - - def _run_before_forward_function(input): - if input.requires_grad: - module.ds_grads_remaining += 1 - - return _apply_forward_and_backward_to_tensors_only( - module, - _run_before_forward_function, - _run_after_backward_hook, - inputs) - - def _post_backward_module_hook(module, inputs): - module.ds_grads_remaining = 0 - - @instrument_w_nvtx - def _run_after_backward_function(sub_module): - if sub_module.ds_grads_remaining == 0: - self.post_sub_module_backward_function(sub_module) - - return _apply_to_tensors_only(module, - PostBackwardFunction, - _run_after_backward_function, - inputs) - - # Pre forward hook - module.register_forward_pre_hook(_pre_forward_module_hook) - # Post forward hook - module.register_forward_hook(_post_forward_module_hook) - - # Pre backward hook - module.register_forward_hook(_pre_backward_module_hook) - - # post backward hook - 
module.register_forward_pre_hook(_post_backward_module_hook) - - @torch.no_grad() - def pre_sub_module_forward_function(self, sub_module): - see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", - force=False) - - global FWD_MODULE_STACK - FWD_MODULE_STACK.append(sub_module) - - param_coordinator = self._get_param_coordinator(training=sub_module.training) - param_coordinator.trace_prologue(sub_module) - if param_coordinator.is_record_trace(): - param_coordinator.record_module(sub_module) - param_coordinator.fetch_sub_module(sub_module) - - see_memory_usage( - f"Before sub module function {sub_module.__class__.__name__} after fetch", - force=False) - - @torch.no_grad() - def post_sub_module_forward_function(self, sub_module): - see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", - force=False) - - param_coordinator = self._get_param_coordinator(training=sub_module.training) - if param_coordinator.is_record_trace(): - param_coordinator.record_parameters(sub_module) - param_coordinator.release_sub_module(sub_module) - - see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", - force=False) - - @torch.no_grad() - def pre_sub_module_backward_function(self, sub_module): - param_coordinator = self._get_param_coordinator(training=sub_module.training) - param_coordinator.trace_prologue(sub_module) - if param_coordinator.is_record_trace(): - param_coordinator.record_module(sub_module) - param_coordinator.record_parameters(sub_module) - param_coordinator.fetch_sub_module(sub_module) - - @torch.no_grad() - def post_sub_module_backward_function(self, sub_module): - see_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release", - force=False) - - self._get_param_coordinator( - training=sub_module.training).release_sub_module(sub_module) - - see_memory_usage( - f"After sub module 
backward function {sub_module.__class__.__name__} {sub_module.id} after release", - force=False) - def _release_ipg_buffers(self): if self.contiguous_gradients: self.ipg_buffer = None @@ -1304,7 +898,6 @@ class DeepSpeedZeroOptimizer_Stage3(object): dtype=gradient_dtype, device=self.device) - timers = self.timers timer_names = set() if self.swap_optimizer: @@ -1332,7 +925,8 @@ class DeepSpeedZeroOptimizer_Stage3(object): dtype=gradient_dtype, device=self.device) if self.offload_optimizer_pin_memory: - subgroup_gradient_buffer = subgroup_gradient_buffer.pin_memory() + subgroup_gradient_buffer = get_accelerator().pin_memory( + subgroup_gradient_buffer) self.fp32_partitioned_groups_flat[i].grad = subgroup_gradient_buffer else: @@ -1509,19 +1103,20 @@ class DeepSpeedZeroOptimizer_Stage3(object): @instrument_w_nvtx @torch.no_grad() def __add_grad_to_ipg_bucket(self, param: Parameter) -> None: - self.__reduce_and_partition_stream.wait_stream(torch.cuda.default_stream()) + self.__reduce_and_partition_stream.wait_stream( + get_accelerator().default_stream()) if self.contiguous_gradients and self.elements_in_ipg_bucket + param.grad.numel( ) < self.reduce_bucket_size: # move the gradient to a contiguous buffer - with torch.cuda.stream(self.__reduce_and_partition_stream): + with get_accelerator().stream(self.__reduce_and_partition_stream): # move the parameter's gradient to the contiguous flat buffer new_grad_tensor = self.__ipg_bucket_flat_buffer.narrow( 0, self.elements_in_ipg_bucket, param.grad.numel()).view_as(param.grad) new_grad_tensor.copy_(param.grad, non_blocking=True) - param.grad.record_stream(torch.cuda.current_stream()) + param.grad.record_stream(get_accelerator().current_stream()) param.grad.data = new_grad_tensor self.__params_in_ipg_bucket.append(param) @@ -1548,7 +1143,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): if len(self.__param_reduce_events) > self.__max_param_reduce_events: self.__param_reduce_events.popleft().synchronize() - with 
torch.cuda.stream(self.__reduce_and_partition_stream): + with get_accelerator().stream(self.__reduce_and_partition_stream): if safe_mode: assert_ints_same_as_other_ranks( [p.ds_id for p in self.__params_in_ipg_bucket]) @@ -1558,18 +1153,19 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.__params_in_ipg_bucket.clear() - event = Event() + event = get_accelerator().Event() event.record() self.__param_reduce_events.append(event) @instrument_w_nvtx def __avg_scatter_grads(self, params_to_reduce: List[Parameter]) -> List[Tensor]: """average gradients and scatter partitions across ranks""" - dtype = get_only_unique_item(p.grad.dtype for p in params_to_reduce) full_grads_for_rank = [p.grad for p in params_to_reduce] - if self.communication_data_type == torch.float32: - full_grads_for_rank = [g.float() for g in full_grads_for_rank] + if self.communication_data_type != self.dtype: + full_grads_for_rank = [ + g.to(self.communication_data_type) for g in full_grads_for_rank + ] if self.postscale_gradients and self.gradient_predivide_factor != 1.0: full_grads_for_rank = [ @@ -1585,8 +1181,10 @@ class DeepSpeedZeroOptimizer_Stage3(object): g.mul(self.gradient_predivide_factor) for g in grad_partitions_for_rank ] - if self.communication_data_type == torch.float32: - grad_partitions_for_rank = [g.to(dtype) for g in grad_partitions_for_rank] + if self.communication_data_type != self.dtype: + grad_partitions_for_rank = [ + g.to(self.dtype) for g in grad_partitions_for_rank + ] return grad_partitions_for_rank @@ -1595,7 +1193,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): current_offset = 0 for param in group: param_id = self.get_param_id(param) - num_elements = param.ds_tensor.ds_numel + num_elements = param.partition_numel() self.grad_position[param_id] = [ int(i), @@ -1622,7 +1220,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.norm_for_param_grads[param_id] = self._constant_buffered_norm2(param.grad) def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param, 
fp32_grad_tensor): - with torch.cuda.stream(self.copy_grad_stream): + with get_accelerator().stream(self.copy_grad_stream): param_id = self.get_param_id(param) src_tensor = param.grad.view(-1).float() #print(f"src_tensor {src_tensor.size()} and fp32 grad {fp32_grad_tensor.size()}") @@ -1640,14 +1238,13 @@ class DeepSpeedZeroOptimizer_Stage3(object): total_norm += param_norm.item()**2 # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.SUM, + group=self.dp_process_group) - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) + self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda[0].item()**(1. / norm_type) @@ -1661,10 +1258,15 @@ class DeepSpeedZeroOptimizer_Stage3(object): def __partition_grads(self, params_to_release: List[Parameter], grad_partitions: List[Tensor]) -> None: + offload_fp32_gradients = {} + offload_fp32_offsets = {} for param, grad_partition in zip(params_to_release, grad_partitions): - if param.ds_tensor.ds_numel * dist.get_rank( - self.dp_process_group) > param.ds_numel: + + contains_real_data = param.partition_numel() * dist.get_rank( + self.dp_process_group) < param.ds_numel + if not contains_real_data: # this grad partition is empty - don't need to do anything + param.grad = None continue # move or accumulate gradient partition to target buffer @@ -1677,7 +1279,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): # ensure grad buffer is a CUDA buffer to speed up the next few # operations and so it can be used asynchronously grad_buffer = grad_buffer.to(grad_partition.device, non_blocking=True) - elif grad_buffer.is_cuda: + elif 
get_accelerator().on_accelerator(grad_buffer): grad_buffer.add_(grad_partition) else: # if dst is CPU, copy first to src device, do the addition @@ -1702,8 +1304,6 @@ class DeepSpeedZeroOptimizer_Stage3(object): # offload the gradient partition if applicable if self.offload_optimizer: i, dest_offset, _ = self.grad_position[self.get_param_id(param)] - offload_fp32_gradients = {} - offload_fp32_offsets = {} if self.is_gradient_accumulation_boundary: self.norm_for_param_grads[self.get_param_id( @@ -1724,7 +1324,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): fp32_grad_tensor.copy_(grad_buffer) # free the gradient - param.grad.record_stream(torch.cuda.current_stream()) + param.grad.record_stream(get_accelerator().current_stream()) param.grad = None if self.offload_optimizer and self.swap_optimizer: @@ -1806,11 +1406,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): ######################Reduction Related Methods############################## - def allreduce_bucket(self, - bucket, - communication_data_type=torch.float16, - rank=None, - log=None): + def allreduce_bucket(self, bucket, rank=None, log=None): rank = None tensor = self.flatten(bucket) @@ -1818,6 +1414,8 @@ class DeepSpeedZeroOptimizer_Stage3(object): if pg_correctness_test: communication_data_type = torch.float32 + else: + communication_data_type = self.communication_data_type if communication_data_type != tensor.dtype: tensor_to_allreduce = tensor.to(communication_data_type) @@ -1828,7 +1426,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): # "All Reducing" dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) else: - global_rank = _get_global_rank(self.dp_process_group, rank) + global_rank = dist.get_global_rank(self.dp_process_group, rank) dist.reduce(tensor_to_allreduce, global_rank, group=self.dp_process_group) if communication_data_type != tensor.dtype and tensor is not tensor_to_allreduce: @@ -1839,7 +1437,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): # if rank is specified do a 
reduction instead of an allreduce def allreduce_and_copy(self, small_bucket, rank=None, log=None): - with torch.cuda.stream(self.reduction_stream): + with get_accelerator().stream(self.reduction_stream): allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log) if rank is None or rank == dist.get_rank(group=self.dp_process_group): for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): @@ -1919,7 +1517,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): return params_in_partition, params_not_in_partition, first_offset @instrument_w_nvtx - def zero_grad(self, set_grads_to_None=True): + def zero_grad(self, set_to_none=False): """ Zero FP16 parameter grads. """ @@ -1929,9 +1527,9 @@ class DeepSpeedZeroOptimizer_Stage3(object): # For speed, set model fp16 grad to None by default for group in self.fp16_groups: for p in group: - if set_grads_to_None: - if p.grad is not None and p.grad.is_cuda: - p.grad.record_stream(torch.cuda.current_stream()) + if set_to_none: + if p.grad is not None and get_accelerator().on_accelerator(p.grad): + p.grad.record_stream(get_accelerator().current_stream()) p.grad = None else: if p.grad is not None: @@ -1944,9 +1542,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): if self.model_parallel_group is None: pass else: - torch.distributed.all_reduce(tensor=tensor, - op=op, - group=self.model_parallel_group) + dist.all_reduce(tensor=tensor, op=op, group=self.model_parallel_group) @instrument_w_nvtx def get_grad_norm_direct(self, gradients, params, norm_type=2): @@ -1969,14 +1565,13 @@ class DeepSpeedZeroOptimizer_Stage3(object): norm_type = float(norm_type) if norm_type == inf: total_norm = max(g.data.abs().max() for g in gradients) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + dist.all_reduce(total_norm_cuda, 
+ op=dist.ReduceOp.MAX, + group=self.dp_process_group) # Take max across all GPUs. - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.MAX) + self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) total_norm = total_norm_cuda[0].item() else: # if dist.get_rank() == 0: @@ -1984,17 +1579,18 @@ class DeepSpeedZeroOptimizer_Stage3(object): grad_norms = [] for g, p in zip(gradients, params): if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - grad_norms.append(g.cuda(non_blocking=True).double().norm(2)) + grad_norms.append( + g.to(get_accelerator().device_name(), + non_blocking=True).double().norm(2)) # Sum across all model parallel GPUs. total_norm_cuda = torch.sum(torch.pow(torch.stack(grad_norms), 2)) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.SUM, + group=self.dp_process_group) - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) + self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda.item()**(1. 
/ norm_type) @@ -2122,10 +1718,11 @@ class DeepSpeedZeroOptimizer_Stage3(object): self.fp32_partitioned_groups_flat[sub_group_id].grad = single_grad_partition # release all the gradient since we have already created a necessary copy in dp_grad_partition - self.zero_grad() + self.zero_grad(set_to_none=True) - for grad in filter(lambda g: g.is_cuda, self.averaged_gradients[sub_group_id]): - grad.record_stream(torch.cuda.current_stream()) + for grad in filter(lambda g: get_accelerator().on_accelerator(g), + self.averaged_gradients[sub_group_id]): + grad.record_stream(get_accelerator().current_stream()) self.averaged_gradients[sub_group_id] = None @@ -2230,7 +1827,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): def _overflow_clean_up(self, prev_scale): see_memory_usage('After overflow before clearing gradients', force=False) - self.zero_grad() + self.zero_grad(set_to_none=True) if self.offload_optimizer: self.reset_cpu_buffers() @@ -2239,12 +1836,11 @@ class DeepSpeedZeroOptimizer_Stage3(object): see_memory_usage('After overflow after clearing gradients', force=False) - if torch.distributed.get_rank() == 0: - logger.info( - "[deepspeed] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) + if dist.get_rank() == 0: + overflow_msg = f"[deepspeed] OVERFLOW! Rank {dist.get_rank()} Skipping step." 
+ if self.dtype == torch.half: + overflow_msg += f" Attempted loss scale: {prev_scale}, reducing to {self.loss_scale}" + logger.info(overflow_msg) @instrument_w_nvtx def _overflow_check_and_loss_scale_update(self): @@ -2289,6 +1885,14 @@ class DeepSpeedZeroOptimizer_Stage3(object): else: self._partitioned_params_swap_out(sub_group_id) + def override_loss_scale(self, loss_scale): + if loss_scale != self.external_loss_scale: + logger.info( + f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' + ) + self.custom_loss_scaler = True + self.external_loss_scale = loss_scale + @instrument_w_nvtx def step(self, closure=None): """ @@ -2337,9 +1941,8 @@ class DeepSpeedZeroOptimizer_Stage3(object): self._post_step(timer_names) # warn user about caching allocator flushes - alloc_retries = torch.cuda.memory_stats()["num_alloc_retries"] if hasattr( - torch.cuda, - "memory_stats") else 0 + memory_stats = get_accelerator().memory_stats() + alloc_retries = memory_stats["num_alloc_retries"] if memory_stats != None else 0 if alloc_retries > self.__n_caching_allocator_flushes: if dist.get_rank() == 0: logger.warning( @@ -2348,7 +1951,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): "performance. if this is happening frequently consider adjusting " "settings to reduce memory consumption. 
If you are unable to " "make the cache flushes go away consider adding " - "torch.cuda.empty_cache() calls in your training loop to ensure " + "get_accelerator().empty_cache() calls in your training loop to ensure " "that all ranks flush their caches at the same time", alloc_retries - self.__n_caching_allocator_flushes) self.__n_caching_allocator_flushes = alloc_retries @@ -2419,16 +2022,16 @@ class DeepSpeedZeroOptimizer_Stage3(object): @instrument_w_nvtx def has_overflow(self, partition_gradients=True): if partition_gradients: - with torch.cuda.stream(self.__reduce_and_partition_stream): + with get_accelerator().stream(self.__reduce_and_partition_stream): self.local_overflow = bool(self.__inf_or_nan_tracker.item()) self.__inf_or_nan_tracker.zero_() overflow = self.local_overflow #overflow = self.has_overflow_partitioned_grads_serial() - overflow_gpu = torch.cuda.ByteTensor([overflow]) - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) + overflow_gpu = get_accelerator().ByteTensor([overflow]) + dist.all_reduce(overflow_gpu, + op=dist.ReduceOp.MAX, + group=self.dp_process_group) else: params = [] @@ -2437,12 +2040,11 @@ class DeepSpeedZeroOptimizer_Stage3(object): params.append(param) overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients) - overflow_gpu = torch.cuda.ByteTensor([overflow]) + overflow_gpu = get_accelerator().ByteTensor([overflow]) # Since each model parallel GPU carries only part of the model, # make sure overflow flag is synced across all the model parallel GPUs - self._model_parallel_all_reduce(tensor=overflow_gpu, - op=torch.distributed.ReduceOp.MAX) + self._model_parallel_all_reduce(tensor=overflow_gpu, op=dist.ReduceOp.MAX) overflow = overflow_gpu[0].item() return bool(overflow) @@ -2483,7 +2085,11 @@ class DeepSpeedZeroOptimizer_Stage3(object): see_memory_usage(f"Before backward", force=False) - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) 
+ if self.custom_loss_scaler: + scaled_loss = self.external_loss_scale * loss + scaled_loss.backward() + else: + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) self._get_param_coordinator(training=True).reset_step() @@ -2512,16 +2118,68 @@ class DeepSpeedZeroOptimizer_Stage3(object): return grad_dict + def _fp32_state_allgather(self, param, fp32_state): + reduce_buffer = torch.zeros(self.partition_count * fp32_state.numel(), + dtype=torch.float32, + device=param.device).flatten() + my_rank = dist.get_rank(group=self.dp_process_group) + partitions = [ + reduce_buffer.narrow(0, + fp32_state.numel() * i, + fp32_state.numel()) for i in range(self.partition_count) + ] + partitions[my_rank].data.copy_(fp32_state.data, non_blocking=False) + + dist.all_gather(partitions, partitions[my_rank], group=self.dp_process_group) + + return reduce_buffer.narrow(0, 0, param.ds_numel).view(param.ds_shape) + + def get_fp32_grad_for_param(self, param) -> Tensor: + if not param.requires_grad: + return None + + self.__reduce_and_partition_stream.synchronize() + + if self.offload_optimizer: + group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow( + 0, + dest_offset, + num_elements).to(device=param.device) + else: + fp32_grad = self.__param_id_to_grad_partition[param.ds_id].float() + + return self._fp32_state_allgather(param, fp32_grad) + + def get_full_hp_param(self, param, optim_state_key=None) -> Tensor: + if not param.requires_grad: + return None + + self.__reduce_and_partition_stream.synchronize() + group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] + + if self._swappable_optimizer_subgroup(group_idx): + self._optimizer_states_and_gradient_swap_in(group_idx) + + fp32_param = self.fp32_partitioned_groups_flat[group_idx] + if optim_state_key is None: + fp32_opt_state = fp32_param.narrow(0, + dest_offset, + 
num_elements).to(device=param.device) + else: + fp32_opt_state = self.optimizer.state[fp32_param][optim_state_key].narrow( + 0, + dest_offset, + num_elements).to(device=param.device) + + hp_param = self._fp32_state_allgather(param, fp32_opt_state) + if self._swappable_optimizer_subgroup(group_idx): + self._optimizer_states_and_gradient_swap_out(group_idx) + return hp_param + @instrument_w_nvtx def _partition_all_parameters(self): - """Partitioning Parameters that were not partitioned usually if parameters - of modules whose input parameters do not require grad computation do not - trigger post call and will therefore will remain unpartitioned""" - self._get_param_coordinator(training=self.module.training).release_and_reset_all( - self.module) - for param in iter_params(self.module, recurse=True): - if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: - raise RuntimeError(f"{param.ds_summary()} expected to be released") + self.parameter_offload.partition_all_parameters() def check_overflow(self, partition_gradients=True): self._check_overflow(partition_gradients) @@ -2545,12 +2203,16 @@ class DeepSpeedZeroOptimizer_Stage3(object): def _set_param_groups(self, value): self.optimizer.param_groups = value + self.trainable_param_groups = self._get_trainable_parameter_groups() param_groups = property(_get_param_groups, _set_param_groups) # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" def _get_loss_scale(self): - return self.loss_scaler.loss_scale + if self.custom_loss_scaler: + return self.external_loss_scale + else: + return self.loss_scaler.cur_scale def _set_loss_scale(self, value): self.loss_scaler.cur_scale = value @@ -2613,7 +2275,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): def _rigid_state_dict(self): state_dict = {} - state_dict[ZERO_STAGE] = ZERO_OPTIMIZATION_WEIGHTS + state_dict[ZERO_STAGE] = ZeroStageEnum.weights state_dict['loss_scaler'] = self.loss_scaler state_dict['dynamic_loss_scale'] = 
self.dynamic_loss_scale state_dict['overflow'] = self.overflow @@ -2759,7 +2421,8 @@ class DeepSpeedZeroOptimizer_Stage3(object): def load_state_dict(self, state_dict_list, load_optimizer_states=True, - load_from_fp32_weights=False): + load_from_fp32_weights=False, + checkpoint_folder=None): r"""Loading a ZeRO checkpoint Arguments: state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition. @@ -2776,7 +2439,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): will call ``model.load_state_dict()`` before ``fp16_optimizer_instance.load_state_dict()`` is called. Example:: - model = torch.nn.Linear(D_in, D_out).cuda().half() + model = torch.nn.Linear(D_in, D_out).to(get_accelerator().device_name()).half() optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) ... @@ -2813,7 +2476,7 @@ class DeepSpeedZeroOptimizer_Stage3(object): def _handle_overflow(cpu_sum, x, i): import math - rank = torch.distributed.get_rank() + rank = dist.get_rank() if rank == 0: t_i = -1 for v_i, v in enumerate(x.data.contiguous().view(-1)): @@ -2881,9 +2544,6 @@ def model_to_params(model): return total_params, largest_layer_params -import math - - def estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1, @@ -2941,11 +2601,11 @@ def estimate_zero3_model_states_mem_needs_all_cold(total_params, """ def format_options(cpu_offload, cpu_offload_params, zero_init): enabled = [] - padded_cpu_str = f'{OFFLOAD_CPU_DEVICE:4}' + padded_cpu_str = f'{OffloadDeviceEnum.cpu:4}' param_device = padded_cpu_str if cpu_offload_params else "none" - enabled.append(f"{OFFLOAD_PARAM}={param_device}") + enabled.append(f"offload_param={param_device}") optimizer_device = padded_cpu_str if cpu_offload else "none" - enabled.append(f"{OFFLOAD_OPTIMIZER}={optimizer_device}") + enabled.append(f"offload_optimizer={optimizer_device}") enabled.append(f"zero_init={1 if zero_init else 0}") return ", 
".join(enabled) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py old mode 100644 new mode 100755 index ab3eec32657fd8556d4d029a023d3890a8df92af..aef25135320c7402f9f82bcb85508facb396fc97 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -3,33 +3,43 @@ Copyright 2019 The Microsoft DeepSpeed Team ''' import torch -from torch.distributed.distributed_c10d import _get_global_rank -import torch.distributed as dist -from torch._six import inf +import os +from deepspeed import comm as dist from packaging import version as pkg_version +from collections import OrderedDict -from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler +from deepspeed.runtime import ZeROOptimizer +from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, + empty_cache, see_memory_usage, + inf, is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups) -from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS -from deepspeed.runtime.zero.offload_constants import OFFLOAD_CPU_DEVICE, OFFLOAD_OPTIMIZER +from deepspeed.runtime.zero.config import ZeroStageEnum +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.ops.adam import DeepSpeedCPUAdam -from deepspeed.ops.op_builder import UtilsBuilder from deepspeed.utils import logger from deepspeed.moe.utils import is_moe_param from deepspeed.git_version_info import version + from deepspeed.runtime.constants import PIPE_REPLICATED +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import UtilsBuilder + from deepspeed.checkpoint.constants import (DS_VERSION, + GROUP_PADDINGS, PARTITION_COUNT, SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE, CLIP_GRAD, - ZERO_STAGE) + ZERO_STAGE, + PARAM_SLICE_MAPPINGS) +from deepspeed.utils import link_hp_params +from 
deepspeed.checkpoint import enable_universal_checkpoint # Toggle this to true to enable correctness test # with gradient partitioning and without @@ -41,11 +51,12 @@ def input(msg): def split_half_float_double(tensors): + device_type = get_accelerator().device_name() dtypes = [ - "torch.cuda.HalfTensor", - "torch.cuda.FloatTensor", - "torch.cuda.DoubleTensor", - "torch.cuda.BFloat16Tensor" + "torch.{}.HalfTensor".format(device_type), + "torch.{}.FloatTensor".format(device_type), + "torch.{}.DoubleTensor".format(device_type), + "torch.{}.BFloat16Tensor".format(device_type) ] buckets = [] for i, dtype in enumerate(dtypes): @@ -88,7 +99,7 @@ def _get_padded_tensor(src_tensor, size): return padded_tensor -class DeepSpeedZeroOptimizer(object): +class DeepSpeedZeroOptimizer(ZeROOptimizer): """ DeepSpeedZeroOptimizer designed to reduce the memory footprint required for training large deep learning models. @@ -101,6 +112,7 @@ class DeepSpeedZeroOptimizer(object): """ def __init__(self, init_optimizer, + param_names, timers, static_loss_scale=1.0, dynamic_loss_scale=False, @@ -138,14 +150,15 @@ class DeepSpeedZeroOptimizer(object): # 2. keep common stuff here in case we need to add ne552w fused optimizer later self.elastic_checkpoint = elastic_checkpoint - + self.param_names = param_names + self.mpu = mpu # differences from apex.fp16_utils: # - assume all model params in fp16 # - assume all params requires grad # - flat by groups, not keeping state. TODO: remove state explicitly? # - master grad and unflat master weight never exist. TODO: a way to save out unflat master? 
- if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") + if not get_accelerator().is_available(): + raise SystemError("Cannot use fp16 without accelerator.") self.optimizer = init_optimizer # Load pre-built or JIT compile (un)flatten ops @@ -166,7 +179,8 @@ class DeepSpeedZeroOptimizer(object): self.deepspeed_adam_offload = cpu_offload - self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' + self.device = get_accelerator().current_device_name( + ) if not self.cpu_offload else 'cpu' self.dp_process_group = dp_process_group @@ -198,9 +212,11 @@ class DeepSpeedZeroOptimizer(object): if mpu is None: self.model_parallel_group = None + self.model_parallel_world_size = 1 self.model_parallel_rank = 0 else: self.model_parallel_group = mpu.get_model_parallel_group() + self.model_parallel_world_size = mpu.get_model_parallel_world_size() self.model_parallel_rank = bwc_tensor_model_parallel_rank(mpu) self.overflow = False @@ -253,7 +269,7 @@ class DeepSpeedZeroOptimizer(object): # number of elements per partition in each group self.partition_size = [] - #align nccl all-gather send buffers to 4-bye boundary + # align nccl all-gather send buffers to 4-byte boundary self.nccl_start_alignment_factor = 2 # 4-byte alignment/sizeof(fp16) = 2 assert (allgather_bucket_size % self.nccl_start_alignment_factor == 0), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " @@ -278,21 +294,13 @@ class DeepSpeedZeroOptimizer(object): ] self.bit16_groups.append(trainable_parameters) - # Record padding required to align group to world size - if partition_id == dist.get_world_size( - group=self.real_dp_process_group[i]) - 1: - padding = get_alignment_padding(self.bit16_groups[i], - self.partition_count[i]) - else: - padding = 0 - self.groups_padding.append(padding) - # not sure why apex was cloning the weights before flattening # removing cloning here see_memory_usage(f"Before moving 
param group {i} to CPU") # move all the parameters to cpu to free up GPU space for creating flat buffer move_to_cpu(self.bit16_groups[i]) + empty_cache() see_memory_usage(f"After moving param group {i} to CPU", force=False) # Reorder group parameters for load balancing of gradient partitioning during backward among ranks. @@ -316,11 +324,20 @@ class DeepSpeedZeroOptimizer(object): self.flatten_dense_tensors_aligned( self.round_robin_bit16_groups[i], self.nccl_start_alignment_factor * - dist.get_world_size(group=self.real_dp_process_group[i])).cuda( - torch.cuda.current_device())) + dist.get_world_size(group=self.real_dp_process_group[i])).to( + get_accelerator().current_device_name())) see_memory_usage(f"After flattening and moving param group {i} to GPU", force=False) + # Record padding required for alignment + if partition_id == dist.get_world_size( + group=self.real_dp_process_group[i]) - 1: + padding = self.bit16_groups_flat[i].numel() - sum( + [t.numel() for t in self.round_robin_bit16_groups[i]]) + else: + padding = 0 + self.groups_padding.append(padding) + if dist.get_rank(group=self.real_dp_process_group[i]) == 0: see_memory_usage( f"After Flattening and after emptying param group {i} cache", @@ -341,12 +358,9 @@ class DeepSpeedZeroOptimizer(object): assert (partitioned_data.data_ptr() % (2 * self.nccl_start_alignment_factor) == 0) - # verify that data partition start locations are 4-byte aligned - for partitioned_data in data_parallel_partitions: - assert (partitioned_data.data_ptr() % - (2 * self.nccl_start_alignment_factor) == 0) - - # a partition of the fp32 master weights that will be updated by this process + # A partition of the fp32 master weights that will be updated by this process. + # Note that the params in single_partition_of_fp32_groups is cloned and detached + # from the origin params of the model. 
if not fp16_master_weights_and_gradients: self.single_partition_of_fp32_groups.append( self.parallel_partitioned_bit16_groups[i][partition_id].to( @@ -356,7 +370,9 @@ class DeepSpeedZeroOptimizer(object): self.parallel_partitioned_bit16_groups[i][partition_id].to( self.device).clone().half().detach()) - # modify optimizer of have flat master weight + # Set local optimizer to have flat params of its own partition. + # After this, the local optimizer will only contain its own partition of params. + # In that case, the local optimizer only saves the states(momentum, variance, etc.) related to its partition's params(zero stage1). self.single_partition_of_fp32_groups[ i].requires_grad = True # keep this in case internal optimizer uses it param_group['params'] = [self.single_partition_of_fp32_groups[i]] @@ -379,14 +395,15 @@ class DeepSpeedZeroOptimizer(object): f"Rank: {rank} partition count {self.partition_count} and sizes{[(p.numel(), self.is_moe_param_group[i] if hasattr(self, 'is_moe_param_group') else False) for i,p in enumerate(self.single_partition_of_fp32_groups)]} " ) dist.barrier() - #exit(0) + self.reduce_bucket_size = int(reduce_bucket_size) self.allgather_bucket_size = int(allgather_bucket_size) - self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False) - self.reduction_stream = torch.cuda.Stream() - self.cpu_computation_stream = torch.cuda.Stream() - self.copy_grad_stream = torch.cuda.Stream() + self.reduction_event = get_accelerator().Event(enable_timing=False, + blocking=False) + self.reduction_stream = get_accelerator().Stream() + self.cpu_computation_stream = get_accelerator().Stream() + self.copy_grad_stream = get_accelerator().Stream() self.callback_queued = False self.param_dict = {} @@ -431,13 +448,13 @@ class DeepSpeedZeroOptimizer(object): self.norm_for_param_grads = {} self.local_overflow = False self.grad_position = {} - self.temp_grad_buffer_for_cpu_offload = torch.zeros( - largest_param_numel, - device=self.device, - 
dtype=self.dtype).pin_memory() + self.temp_grad_buffer_for_cpu_offload = get_accelerator().pin_memory( + torch.zeros(largest_param_numel, + device=self.device, + dtype=self.dtype)) self.temp_grad_buffer_for_gpu_offload = torch.zeros( largest_param_numel, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), dtype=self.dtype) for i, params_group in enumerate(self.bit16_groups): self.get_grad_position(i, @@ -469,6 +486,9 @@ class DeepSpeedZeroOptimizer(object): # will store the averaged gradients required by this partition self.averaged_gradients = {} + # For cpu_offload, will store the averaged gradients required by this partition + self.offload_gradient_dict = {} + # store index of first parameter in each partition self.first_param_index_in_partition = {} @@ -482,22 +502,15 @@ class DeepSpeedZeroOptimizer(object): if self.partition_gradients or self.overlap_comm: self.create_reduce_and_remove_grad_hooks() - # we may have a way of fusing dynamic scale. Do not support for now - if self.dtype == torch.float or self.dtype == torch.bfloat16 or not dynamic_loss_scale: - loss_scale_value = 1.0 if ( - (self.dtype == torch.float) or - (self.dtype == torch.bfloat16)) else static_loss_scale - - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=loss_scale_value) - cur_iter = 0 - else: - if dynamic_loss_args is None: - self.loss_scaler = DynamicLossScaler() - else: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) + self.custom_loss_scaler = False + self.external_loss_scale = None - self.dynamic_loss_scale = True + # we may have a way of fusing dynamic scale. 
Do not support for now + self.loss_scaler = CreateLossScaler(dtype=self.dtype, + static_loss_scale=static_loss_scale, + dynamic_scaling=dynamic_loss_scale, + dynamic_loss_args=dynamic_loss_args) + self.dynamic_loss_scale = self.loss_scaler.dynamic see_memory_usage("Before initializing optimizer states", force=True) self.initialize_optimizer_states() @@ -509,11 +522,61 @@ class DeepSpeedZeroOptimizer(object): if dist.get_rank(group=self.dp_process_group) == 0: see_memory_usage(f"After initializing ZeRO optimizer", force=True) + self._link_all_hp_params() + self._enable_universal_checkpoint() + self._param_slice_mappings = self._create_param_mapping() + + def _enable_universal_checkpoint(self): + for lp_param_group in self.bit16_groups: + enable_universal_checkpoint(param_list=lp_param_group) + + def _create_param_mapping(self): + param_mapping = [] + for i, _ in enumerate(self.optimizer.param_groups): + param_mapping_per_group = OrderedDict() + for lp in self.bit16_groups[i]: + if lp._hp_mapping is not None: + lp_name = self.param_names[lp] + param_mapping_per_group[ + lp_name] = lp._hp_mapping.get_hp_fragment_address() + param_mapping.append(param_mapping_per_group) + + return param_mapping + + def _link_all_hp_params(self): + dp_world_size = dist.get_world_size(group=self.dp_process_group) + if self.cpu_offload: + self._get_offload_gradient_dict() + + for i, _ in enumerate(self.optimizer.param_groups): + # Link bit16 and fp32 params in partition + partition_id = dist.get_rank(group=self.real_dp_process_group[i]) + partition_size = self.bit16_groups_flat[i].numel() // dp_world_size + flat_hp_partition = self.single_partition_of_fp32_groups[i] + link_hp_params( + lp_param_list=self.bit16_groups[i], + flat_hp_partition=flat_hp_partition, + gradient_dict=self.averaged_gradients, + offload_gradient_dict=self.offload_gradient_dict, + use_offload=self.cpu_offload, + param_group_index=i, + partition_start=partition_id * partition_size, + partition_size=partition_size, + 
partition_optimizer_state=self.optimizer.state[flat_hp_partition], + dp_group=self.real_dp_process_group[i]) + def is_moe_group(self, group): return 'moe' in group and group['moe'] def _configure_moe_settings(self): - assert self.contiguous_gradients, "Contiguous Gradients in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE" + # if we're using ZeRO stage 2, ensure contiguous gradients are used + if self.partition_gradients: + assert self.contiguous_gradients, "Contiguous Gradients in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE" + # NOTE: To run ZeRO stage 1 with MoE, we need to set self.contiguous_gradients to True or ignore the assertion + if not self.partition_gradients and not self.contiguous_gradients: + logger.warn( + "ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental." + ) assert self.reduce_scatter, "Reduce Scatter in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE" assert any([self.is_moe_group(group) for group in self.optimizer.param_groups]), "The model has moe layers, but None of the param groups are marked as MoE. 
Create a param group with 'moe' key set to True before creating optimizer" @@ -579,9 +642,8 @@ class DeepSpeedZeroOptimizer(object): int(self.partition_size[i]), dtype=self.single_partition_of_fp32_groups[i].dtype, device=self.device) - self.single_partition_of_fp32_groups[ - i].grad = single_grad_partition.pin_memory( - ) if self.cpu_offload else single_grad_partition + self.single_partition_of_fp32_groups[i].grad = get_accelerator().pin_memory( + single_grad_partition) if self.cpu_offload else single_grad_partition self.optimizer.step() @@ -603,7 +665,7 @@ class DeepSpeedZeroOptimizer(object): self.ipg_buffer = [] buf_0 = torch.empty(int(self.reduce_bucket_size), dtype=self.dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) self.ipg_buffer.append(buf_0) self.ipg_index = 0 @@ -664,7 +726,7 @@ class DeepSpeedZeroOptimizer(object): self.params_already_reduced[i] = False if self.overlap_comm: - torch.cuda.synchronize() + get_accelerator().synchronize() # It is safe to clear previously reduced grads of other partitions self._clear_previous_reduced_grads() @@ -677,15 +739,16 @@ class DeepSpeedZeroOptimizer(object): self.first_offset[i], self.partition_size[i], dtype=self.dtype, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), return_tensor_list=True) else: - avg_new = self.get_flat_partition(self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=self.dtype, - device=torch.cuda.current_device(), - return_tensor_list=True) + avg_new = self.get_flat_partition( + self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=self.dtype, + device=get_accelerator().current_device_name(), + return_tensor_list=True) for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i], avg_new): accumulated_grad.add_(new_avg_grad) @@ -695,7 +758,7 @@ class DeepSpeedZeroOptimizer(object): # No need to keep the gradients anymore. 
# All gradients required by the step # are in self.averaged_gradients - self.zero_grad() + self.zero_grad(set_to_none=True) see_memory_usage(f"End ipg_epilogue") # resets all partition to no reduced @@ -878,12 +941,12 @@ class DeepSpeedZeroOptimizer(object): def average_tensor(self, tensor): if self.overlap_comm: - torch.cuda.synchronize() stream = self.reduction_stream + stream.wait_stream(get_accelerator().current_stream()) else: - stream = torch.cuda.current_stream() + stream = get_accelerator().current_stream() - with torch.cuda.stream(stream): + with get_accelerator().stream(stream): if not self.reduce_scatter: self.gradient_reduction_w_predivide(tensor) return @@ -950,14 +1013,18 @@ class DeepSpeedZeroOptimizer(object): if not self.ipg_bucket_has_moe_params: tensor.div_(dist.get_world_size(group=self.dp_process_group)) + tensor_to_reduce = tensor + if self.communication_data_type != tensor.dtype: + tensor_to_reduce = tensor.to(self.communication_data_type) + async_handles = [] for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets): - grad_slice = tensor.narrow(0, int(bucket_offset), int(numel)) + grad_slice = tensor_to_reduce.narrow(0, int(bucket_offset), int(numel)) # if dist.get_rank() == 0: # print(f"Rank {dist.get_rank()} rank offset id {i} real dp size {dist.get_world_size(group=real_dp_process_group[i])} and dst: {dst}") # dist.barrier() #dist.barrier() - dst_rank = _get_global_rank(real_dp_process_group[i], dst) + dst_rank = dist.get_global_rank(real_dp_process_group[i], dst) async_handle = dist.reduce(grad_slice, dst=dst_rank, group=real_dp_process_group[i], @@ -967,6 +1034,9 @@ class DeepSpeedZeroOptimizer(object): for handle in async_handles: handle.wait() + if self.communication_data_type != tensor.dtype: + tensor.copy_(tensor_to_reduce) + ############################################################################## ############################# CPU Offload Methods############################# 
############################################################################## @@ -978,7 +1048,6 @@ class DeepSpeedZeroOptimizer(object): param_start_offset = 0 num_elements = tensor.numel() - tensor_offset = 0 # we need to offset to get to the right element if i == 0 and first_offset > 0: @@ -1002,6 +1071,18 @@ class DeepSpeedZeroOptimizer(object): if param.grad is not None and self._has_inf_or_nan(param.grad.data): self.local_overflow = True + def _get_offload_gradient_dict(self): + for param_group_index, _ in enumerate(self.optimizer.param_groups): + self.offload_gradient_dict[param_group_index] = [] + for lp_param in self.params_in_partition[param_group_index]: + param_id = self.get_param_id(lp_param) + [_, _, dest_offset, num_elements] = self.grad_position[param_id] + dest_tensor = self.single_partition_of_fp32_groups[ + param_group_index].grad.view(-1).narrow(0, + dest_offset, + num_elements) + self.offload_gradient_dict[param_group_index].append(dest_tensor) + def async_accumulate_grad_in_cpu_via_gpu(self, param): param_id = self.get_param_id(param) @@ -1016,9 +1097,10 @@ class DeepSpeedZeroOptimizer(object): #buffer for storing gradients for this parameter in CPU def buffer_to_accumulate_to_in_cpu(): if not self.fp16_master_weights_and_gradients: - return torch.zeros(param.numel(), - dtype=param.dtype, - device=self.device).pin_memory() + return get_accelerator().pin_memory( + torch.zeros(param.numel(), + dtype=param.dtype, + device=self.device)) else: return self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( 0, @@ -1137,13 +1219,12 @@ class DeepSpeedZeroOptimizer(object): """ # Sum across all model parallel GPUs. 
- total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.SUM, + group=self.dp_process_group) - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) + self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda[0].item()**(1. / norm_type) @@ -1177,9 +1258,10 @@ class DeepSpeedZeroOptimizer(object): total_size += param_in_partition.numel() see_memory_usage(f"before copying {total_size} gradients into partition") - self.grads_in_partition = torch.empty(int(total_size), - dtype=self.dtype, - device=torch.cuda.current_device()) + self.grads_in_partition = torch.empty( + int(total_size), + dtype=self.dtype, + device=get_accelerator().current_device_name()) see_memory_usage(f"after copying {total_size} gradients into partition") # The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer @@ -1213,13 +1295,13 @@ class DeepSpeedZeroOptimizer(object): stream = self.reduction_stream elif self.cpu_offload: # TODO: copy_grad_stream is disabled because of race with reduce. This hurts perf and should be fixed. 
- # torch.cuda.synchronize() + # get_accelerator().synchronize() # stream = self.copy_grad_stream - stream = torch.cuda.current_stream() + stream = get_accelerator().current_stream() else: - stream = torch.cuda.current_stream() + stream = get_accelerator().current_stream() - with torch.cuda.stream(stream): + with get_accelerator().stream(stream): for _, param, param_id in self.params_in_ipg_bucket: assert self.params_already_reduced[param_id] == False, \ @@ -1323,11 +1405,7 @@ class DeepSpeedZeroOptimizer(object): param.grad = torch.zero_like(param) ######################Reduction Related Methods############################## - def allreduce_bucket(self, - bucket, - communication_data_type=torch.float16, - rank=None, - log=None): + def allreduce_bucket(self, bucket, rank=None, log=None): rank = None tensor = self.flatten(bucket) @@ -1335,6 +1413,8 @@ class DeepSpeedZeroOptimizer(object): if pg_correctness_test: communication_data_type = torch.float32 + else: + communication_data_type = self.communication_data_type if communication_data_type != tensor.dtype: tensor_to_allreduce = tensor.to(communication_data_type) @@ -1345,7 +1425,7 @@ class DeepSpeedZeroOptimizer(object): # "All Reducing" dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) else: - global_rank = _get_global_rank(self.dp_process_group, rank) + global_rank = dist.get_global_rank(self.dp_process_group, rank) dist.reduce(tensor_to_allreduce, global_rank, group=self.dp_process_group) if communication_data_type != tensor.dtype and tensor is not tensor_to_allreduce: @@ -1363,14 +1443,14 @@ class DeepSpeedZeroOptimizer(object): # if rank is specified do a reduction instead of an allreduce def allreduce_and_copy(self, small_bucket, rank=None, log=None): if self.overlap_comm: - torch.cuda.synchronize() + get_accelerator().synchronize() # It is safe to clear the previously reduced grads of other partitions self._clear_previous_reduced_grads() stream = self.reduction_stream else: - stream = 
torch.cuda.current_stream() + stream = get_accelerator().current_stream() - with torch.cuda.stream(stream): + with get_accelerator().stream(stream): allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log) if rank is None or rank == dist.get_rank(group=self.dp_process_group): for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): @@ -1418,7 +1498,7 @@ class DeepSpeedZeroOptimizer(object): partitions = [] dp = dist.get_world_size(group=self.real_dp_process_group[group_id]) - dp_id = dist.get_rank(group=self.real_dp_process_group[group_id]) + # dp_id = dist.get_rank(group=self.real_dp_process_group[group_id]) total_num_elements = tensor.numel() @@ -1465,7 +1545,7 @@ class DeepSpeedZeroOptimizer(object): return params_in_partition, params_not_in_partition, first_offset - def zero_grad(self, set_grads_to_None=True): + def zero_grad(self, set_to_none=False): """ Zero FP16 parameter grads. """ @@ -1473,7 +1553,7 @@ class DeepSpeedZeroOptimizer(object): # For speed, set model fp16 grad to None by default for group in self.bit16_groups: for p in group: - if set_grads_to_None: + if set_to_none: p.grad = None # epilogue and in step else: if p.grad is not None: @@ -1483,12 +1563,10 @@ class DeepSpeedZeroOptimizer(object): def _model_parallel_all_reduce(self, tensor, op): """ Perform all reduce within model parallel group, if any. """ - if self.model_parallel_group is None: + if self.model_parallel_group is None or self.model_parallel_world_size == 1: pass else: - torch.distributed.all_reduce(tensor=tensor, - op=op, - group=self.model_parallel_group) + dist.all_reduce(tensor=tensor, op=op, group=self.model_parallel_group) def get_grad_norm_direct(self, gradients, params, norm_type=2): """Clips gradient norm of an iterable of parameters. 
@@ -1510,14 +1588,13 @@ class DeepSpeedZeroOptimizer(object): norm_type = float(norm_type) if norm_type == inf: total_norm = max(g.data.abs().max() for g in gradients) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=self.dp_process_group) # Take max across all GPUs. - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.MAX) + self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) total_norm = total_norm_cuda[0].item() else: total_norm = 0.0 @@ -1531,13 +1608,12 @@ class DeepSpeedZeroOptimizer(object): param_norm = g.data.double().norm(2) total_norm += param_norm.item()**2 # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.SUM, + group=self.dp_process_group) - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) + self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) @@ -1628,6 +1704,61 @@ class DeepSpeedZeroOptimizer(object): for name in timer_names: self.timers(name).stop() + def set_lr(self, lr): + """Set the learning rate.""" + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + def get_lr(self): + """Return the current learning rate.""" + return self.optimizer.param_groups[0]["lr"] + + def override_loss_scale(self, loss_scale): + if loss_scale != self.external_loss_scale: + logger.info( + f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' + ) + self.custom_loss_scaler = True + self.external_loss_scale = loss_scale + + def scaled_global_norm(self, norm_type=2): + assert norm_type == 2, "only L2 norm supported" + norm_groups = [] + for i, group in enumerate(self.bit16_groups): + partition_id = dist.get_rank(group=self.real_dp_process_group[i]) + if self.cpu_offload: + norm_groups.append( + self.complete_grad_norm_calculation_for_cpu_offload( + self.params_in_partition[i])) + single_grad_partition = self.single_partition_of_fp32_groups[i].grad + else: + norm_groups.append( + self.get_grad_norm_direct(self.averaged_gradients[i], + self.params_in_partition[i])) + + if self.has_moe_layers: + self._average_expert_grad_norms(norm_groups) + + # note that the get_global_norm function only supports l2 norm + return get_global_norm(norm_list=norm_groups) + + def get_bit16_param_group(self, group_no): + bit16_partitions = self.parallel_partitioned_bit16_groups[group_no] + partition_id = dist.get_rank(group=self.real_dp_process_group[group_no]) + return [ + bit16_partitions[dist.get_rank(group=self.real_dp_process_group[group_no])] + ] + + def _optimizer_step(self, group_no): + original_param_groups = self.optimizer.param_groups + self.optimizer.param_groups = [original_param_groups[group_no]] + from deepspeed.ops.adam import DeepSpeedCPUAdam + if type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half: + 
self.optimizer.step(fp16_param_groups=[self.get_bit16_param_group(group_no)]) + else: + self.optimizer.step() + self.optimizer.param_groups = original_param_groups + def step(self, closure=None): """ Not supporting closure. @@ -1646,16 +1777,14 @@ class DeepSpeedZeroOptimizer(object): prev_scale = self.loss_scale self._update_scale(self.overflow) if self.overflow: - if dist.get_rank() == 0: - logger.info( - "[deepspeed] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) + overflow_msg = f"[deepspeed] OVERFLOW! Rank {dist.get_rank()} Skipping step." + if self.dtype == torch.half: + overflow_msg += f" Attempted loss scale: {prev_scale}, reducing to {self.loss_scale}" + logger.info(overflow_msg) see_memory_usage('After overflow before clearing gradients') - self.zero_grad() + self.zero_grad(set_to_none=True) if self.cpu_offload: self.reset_cpu_buffers() else: @@ -1667,23 +1796,34 @@ class DeepSpeedZeroOptimizer(object): self.stop_timers(timer_names) return - self.start_timers([OPTIMIZER_GRADIENTS]) - norm_groups = [] - single_partition_grad_groups = [] - skip = False + # Step 1:- Calculate gradient norm using fp-16 grads + see_memory_usage('Before norm calculation') + scaled_global_grad_norm = self.scaled_global_norm() + self._global_grad_norm = scaled_global_grad_norm / prev_scale + + see_memory_usage('After norm before optimizer') + # Step 2:- run optimizer and upscaling simultaneously for i, group in enumerate(self.bit16_groups): + self.start_timers([OPTIMIZER_GRADIENTS]) partition_id = dist.get_rank(group=self.real_dp_process_group[i]) if self.cpu_offload: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.params_in_partition[i])) single_grad_partition = self.single_partition_of_fp32_groups[i].grad - else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.params_in_partition[i])) + 
self.unscale_and_clip_grads([single_grad_partition], + scaled_global_grad_norm) + self.stop_timers([OPTIMIZER_GRADIENTS]) + self.start_timers([OPTIMIZER_STEP]) + self._optimizer_step(i) + + from deepspeed.ops.adam import DeepSpeedCPUAdam + if not (type(self.optimizer) == DeepSpeedCPUAdam + and self.dtype == torch.half): + bit16_partitions = self.parallel_partitioned_bit16_groups[i] + fp32_partition = self.single_partition_of_fp32_groups[i] + bit16_partitions[partition_id].data.copy_(fp32_partition.data) - # free gradients for all the parameters that are not updated by this process + self.stop_timers([OPTIMIZER_STEP]) + else: + # free gradients for all the parameters that are not updated by this process(ZeRO stage2) self.free_grad_in_param_list(self.params_not_in_partition[i]) # create a flat gradients for parameters updated by this process @@ -1702,55 +1842,33 @@ class DeepSpeedZeroOptimizer(object): single_grad_partition.numel(), self.partition_size[i], i, partition_id) self.single_partition_of_fp32_groups[i].grad = single_grad_partition - # release all the gradient since we have already created a necessary copy in dp_grad_partition + # release all the gradient since we have already created a necessary copy in dp_grad_partition(ZeRO stage2) self.free_grad_in_param_list(self.params_in_partition[i]) self.averaged_gradients[i] = None - single_partition_grad_groups.append(single_grad_partition) - - if self.has_moe_layers: - self._average_expert_grad_norms(norm_groups) - - scaled_global_grad_norm = get_global_norm(norm_list=norm_groups) - self.unscale_and_clip_grads(single_partition_grad_groups, - scaled_global_grad_norm) - - # Stash unscaled gradient norm - self._global_grad_norm = scaled_global_grad_norm / self.loss_scale - - self.stop_timers([OPTIMIZER_GRADIENTS]) - - self.start_timers([OPTIMIZER_STEP]) - if self.deepspeed_adam_offload: - from deepspeed.ops.adam import DeepSpeedCPUAdam - if type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half: - 
bit16_param_groups = [[ - bit16_partitions[partition_id] - ] for bit16_partitions in self.parallel_partitioned_bit16_groups] - self.optimizer.step(fp16_param_groups=bit16_param_groups) - else: - self.optimizer.step() - for bit16_partitions, fp32_partition in zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups): - bit16_partitions[partition_id].data.copy_(fp32_partition.data) - else: - self.optimizer.step() - - # get rid of the fp32 gradients. Not needed anymore - if not self.cpu_offload: - for group in self.single_partition_of_fp32_groups: - group.grad = None # in step - - for bit16_partitions, fp32_partition in zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups): + self.unscale_and_clip_grads([single_grad_partition], + scaled_global_grad_norm) + self.stop_timers([OPTIMIZER_GRADIENTS]) + + # Step 3:- run the optimizer if no offloading + self.start_timers([OPTIMIZER_STEP]) + self._optimizer_step(i) + # Step 4:- get rid of the fp32 gradients. Not needed anymore + self.single_partition_of_fp32_groups[i].grad = None + del single_grad_partition + bit16_partitions = self.parallel_partitioned_bit16_groups[i] + fp32_partition = self.single_partition_of_fp32_groups[i] bit16_partitions[partition_id].data.copy_(fp32_partition.data) + self.stop_timers([OPTIMIZER_STEP]) - self.stop_timers([OPTIMIZER_STEP]) - + see_memory_usage('After optimizer before all-gather') if self.cpu_offload: self.reset_cpu_buffers() self.start_timers([OPTIMIZER_ALLGATHER]) - # gather the updated weights from everyone + # Gather the updated weights from everyone. + # Then all partitions of the model parameters are updated and ready for next round forward. all_gather_dp_groups( partitioned_param_groups=self.parallel_partitioned_bit16_groups, dp_process_group=self.real_dp_process_group, @@ -1760,7 +1878,7 @@ class DeepSpeedZeroOptimizer(object): self.stop_timers([OPTIMIZER_ALLGATHER]) # TODO: we probably don't need this? 
just to be safe - for i in range(len(norm_groups)): + for i in range(len(self.bit16_groups)): self._update_model_bit16_weights(i) self.log_timers(timer_names) @@ -1768,13 +1886,28 @@ class DeepSpeedZeroOptimizer(object): return + @torch.no_grad() + def update_lp_params(self): + for i, (bit16_partitions, fp32_partition) in enumerate(zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)): + partition_id = dist.get_rank(group=self.real_dp_process_group[i]) + bit16_partitions[partition_id].data.copy_(fp32_partition.data) + # print_rank_0(f'update_lp_params {i=} {partition_id=}', force=True) + # if i == 0: + # print_rank_0(f'{fp32_partition[:10]=}', force=True) + + all_gather_dp_groups( + partitioned_param_groups=self.parallel_partitioned_bit16_groups, + dp_process_group=self.real_dp_process_group, + start_alignment_factor=self.nccl_start_alignment_factor, + allgather_bucket_size=self.allgather_bucket_size) + def _average_expert_grad_norms(self, norm_groups): for i, norm in enumerate(norm_groups): if self.is_moe_param_group[i]: scaled_norm = norm * 1.0 / float( dist.get_world_size(group=self.real_dp_process_group[i])) scaled_norm_tensor = torch.tensor(scaled_norm, - device='cuda', + device=get_accelerator().device_name(), dtype=torch.float) dist.all_reduce(scaled_norm_tensor, group=self.real_dp_process_group[i]) norm_groups[i] = scaled_norm_tensor.item() @@ -1818,12 +1951,12 @@ class DeepSpeedZeroOptimizer(object): if partition_gradients: overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( ) - overflow_gpu = torch.cuda.ByteTensor([overflow]) + overflow_gpu = get_accelerator().ByteTensor([overflow]) '''This will capture overflow across all data parallel and expert parallel process Since expert parallel process are a subset of data parallel process''' - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) + dist.all_reduce(overflow_gpu, + 
op=dist.ReduceOp.MAX, + group=self.dp_process_group) else: params = [] @@ -1832,12 +1965,11 @@ class DeepSpeedZeroOptimizer(object): params.append(param) overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients) - overflow_gpu = torch.cuda.ByteTensor([overflow]) + overflow_gpu = get_accelerator().ByteTensor([overflow]) # Since each model parallel GPU carries only part of the model, # make sure overflow flag is synced across all the model parallel GPUs - self._model_parallel_all_reduce(tensor=overflow_gpu, - op=torch.distributed.ReduceOp.MAX) + self._model_parallel_all_reduce(tensor=overflow_gpu, op=dist.ReduceOp.MAX) overflow = overflow_gpu[0].item() return bool(overflow) @@ -1878,18 +2010,22 @@ class DeepSpeedZeroOptimizer(object): self.ipg_buffer = [] buf_0 = torch.empty(int(self.reduce_bucket_size), dtype=self.dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) self.ipg_buffer.append(buf_0) # Use double buffers to avoid data access conflict when overlap_comm is enabled. 
if self.overlap_comm: buf_1 = torch.empty(int(self.reduce_bucket_size), dtype=self.dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) self.ipg_buffer.append(buf_1) self.ipg_index = 0 - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + if self.custom_loss_scaler: + scaled_loss = self.external_loss_scale * loss + scaled_loss.backward() + else: + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) def check_overflow(self, partition_gradients=True): self._check_overflow(partition_gradients) @@ -1918,7 +2054,10 @@ class DeepSpeedZeroOptimizer(object): # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" def _get_loss_scale(self): - return self.loss_scaler.loss_scale + if self.custom_loss_scaler: + return self.external_loss_scale + else: + return self.loss_scaler.cur_scale def _set_loss_scale(self, value): self.loss_scaler.cur_scale = value @@ -1988,10 +2127,13 @@ class DeepSpeedZeroOptimizer(object): self.single_partition_of_fp32_groups) state_dict[SINGLE_PARTITION_OF_FP32_GROUPS] = fp32_groups_without_padding - state_dict[ZERO_STAGE] = ZERO_OPTIMIZATION_GRADIENTS + state_dict[ + ZERO_STAGE] = ZeroStageEnum.gradients if self.partition_gradients else ZeroStageEnum.optimizer_states + state_dict[GROUP_PADDINGS] = self.groups_padding state_dict[PARTITION_COUNT] = self.partition_count state_dict[DS_VERSION] = version + state_dict[PARAM_SLICE_MAPPINGS] = self._param_slice_mappings return state_dict @@ -2101,7 +2243,47 @@ class DeepSpeedZeroOptimizer(object): def load_state_dict(self, state_dict_list, load_optimizer_states=True, - load_from_fp32_weights=False): + load_from_fp32_weights=False, + checkpoint_folder=None): + if checkpoint_folder: + self._load_universal_checkpoint(checkpoint_folder, + load_optimizer_states, + load_from_fp32_weights) + else: + self._load_legacy_checkpoint(state_dict_list, + load_optimizer_states, + load_from_fp32_weights) + + def 
_load_universal_checkpoint(self, + checkpoint_folder, + load_optimizer_states, + load_from_fp32_weights): + self._load_hp_checkpoint_state(checkpoint_folder) + + @property + def param_groups(self): + """Forward the wrapped optimizer's parameters.""" + return self.optimizer.param_groups + + def _load_hp_checkpoint_state(self, checkpoint_dir): + checkpoint_dir = os.path.join(checkpoint_dir, "zero") + tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) + tp_world_size = self.mpu.get_slice_parallel_world_size() + + for i, _ in enumerate(self.optimizer.param_groups): + for lp in self.bit16_groups[i]: + if lp._hp_mapping is not None: + #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") + lp.load_hp_checkpoint_state( + os.path.join(checkpoint_dir, + self.param_names[lp]), + tp_rank, + tp_world_size) + + def _load_legacy_checkpoint(self, + state_dict_list, + load_optimizer_states=True, + load_from_fp32_weights=False): r"""Loading ZeRO checkpoint Arguments: @@ -2119,7 +2301,7 @@ class DeepSpeedZeroOptimizer(object): will call ``model.load_state_dict()`` before ``fp16_optimizer_instance.load_state_dict()`` is called. Example:: - model = torch.nn.Linear(D_in, D_out).cuda().half() + model = torch.nn.Linear(D_in, D_out).to(get_accelerator().device_name()).half() optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) ... @@ -2151,6 +2333,16 @@ class DeepSpeedZeroOptimizer(object): ckpt_is_rigid = isinstance(current_rank_sd[BASE_OPTIMIZER_STATE], dict) + # padding is always at the last rank/partition + # if DP=1024 and param-group elems=16 -> padding will be 1024-16 across all but one rank + # scenario-1 (shrink): saving w. 4 gpus -> loading w. 2 gpus + # scenario-2 (expand): saving w. 2 gpus -> loading w. 
4 gpus + # if load_optimizer_states: + # if new_dp_size: + # self.strip_padding() + # self.add_padding_w_new_dp_size() + # self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE]) + if load_optimizer_states: if ckpt_is_rigid: # loading rigid ckpt into either rigid or elastic exec @@ -2192,10 +2384,13 @@ class DeepSpeedZeroOptimizer(object): # option 1 from above self._restore_from_bit16_weights() + if load_optimizer_states: + self._link_all_hp_params() + def _handle_overflow(cpu_sum, x, i): import math - rank = torch.distributed.get_rank() + rank = dist.get_rank() if rank == 0: t_i = -1 for v_i, v in enumerate(x.data.contiguous().view(-1)): @@ -2287,8 +2482,8 @@ def estimate_zero2_model_states_mem_needs_all_cold(total_params, """ def format_options(cpu_offload): enabled = [] - device = f'{OFFLOAD_CPU_DEVICE:4}' if cpu_offload else "none" - enabled.append(f"{OFFLOAD_OPTIMIZER}={device}") + device = f'{OffloadDeviceEnum.cpu:4}' if cpu_offload else "none" + enabled.append(f"offload_optimizer={device}") return ", ".join(enabled) nodes_str = "nodes" if num_nodes > 1 else "node" diff --git a/deepspeed/runtime/zero/test.py b/deepspeed/runtime/zero/test.py index 29213d604ce54dd58320667ff6ae6a2204731996..d13c247b3de47a2ccc6681084e64ebe46d833ca0 100644 --- a/deepspeed/runtime/zero/test.py +++ b/deepspeed/runtime/zero/test.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch from deepspeed.runtime.zero.contiguous_memory_allocator import ContiguousMemoryAllocator diff --git a/deepspeed/runtime/zero/tiling.py b/deepspeed/runtime/zero/tiling.py index 3a78253df4969d1f07eb7d40b9dfd2d371e70efc..d66caea85ee76ef8c0ad84ac0228b22da3ec2f10 100644 --- a/deepspeed/runtime/zero/tiling.py +++ b/deepspeed/runtime/zero/tiling.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch import deepspeed from deepspeed.runtime.utils import partition_uniform as partition diff --git a/deepspeed/runtime/zero/utils.py 
b/deepspeed/runtime/zero/utils.py old mode 100644 new mode 100755 index fff8ebd31ef38afc61e570fff567ca6e61d2c90b..eafddad3f35fa07be4b7df427ff716d60264ad6a --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -1,12 +1,15 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import os from typing import List import torch -import torch.distributed as dist +from deepspeed import comm as dist from deepspeed.utils import logger from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.ops.adam import FusedAdam from deepspeed.utils.nvtx import instrument_w_nvtx +from deepspeed.accelerator import get_accelerator def _initialize_parameter_parallel_groups(parameter_parallel_size=None): @@ -21,7 +24,7 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None): my_group = None for i in range(data_parallel_size // parameter_parallel_size): ranks = range(i * parameter_parallel_size, (i + 1) * parameter_parallel_size) - group = torch.distributed.new_group(ranks) + group = dist.new_group(ranks) if rank in ranks: my_group = group return my_group @@ -63,8 +66,8 @@ def get_lst_from_rank0(lst: List[int]) -> None: lst_tensor = torch.tensor( lst if dist.get_rank() == 0 else [-1] * len(lst), dtype=int, - # device=torch.cuda.current_device(), - device=torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])), + # device=get_accelerator().current_device_name(), + device=torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])), requires_grad=False, ) dist.broadcast(lst_tensor, src=0, async_op=False) diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index efa45853f0dfc3846c99eda8aacc91062f756a80..c48383c01c60df7043ddc0028c8e85e6c0065982 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -1,5 +1,13 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .logging import logger, log_dist -from .distributed import init_distributed +from .comms_logging import get_caller_func +#from .distributed 
import init_distributed +from .init_on_device import OnDevice from .groups import * from .nvtx import instrument_w_nvtx +# TODO: Move tensor fragment and mixed precision to zero utils +from .tensor_fragment import tensor_fragment, get_full_hp_param, get_hp_fragment_mapping, fragment_address, get_full_hp_grad +from .tensor_fragment import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state +from .mixed_precision_linkage import link_hp_params from deepspeed.runtime.dataloader import RepeatingLoader diff --git a/deepspeed/utils/comms_logging.py b/deepspeed/utils/comms_logging.py new file mode 100644 index 0000000000000000000000000000000000000000..d6b0410642a9670318538d5c8df5c8842a0853d5 --- /dev/null +++ b/deepspeed/utils/comms_logging.py @@ -0,0 +1,141 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import math +from deepspeed.utils import log_dist + + +def get_caller_func(frame=3): + import sys + return sys._getframe(frame).f_code.co_name + + +# Helper function to pretty-print message sizes +def convert_size(size_bytes): + if size_bytes == 0: + return "0B" + size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return "%s %s" % (s, size_name[i]) + + +# Helper function to calculate algbw and busbw. 
+# See https://gist.github.com/jeffra/b5e80466b4c86be00ea3b6f130fb7a36 and https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md +def calc_bw_log(comm_op, size, duration): + import deepspeed.comm as dist + + n = dist.get_world_size() + tput = 0 + busbw = 0 + if comm_op == "all_to_all_single": + tput = (size / duration) + busbw = (size / duration) * ((n - 1) / n) + elif comm_op == "all_gather" or comm_op == "all_gather_base" or comm_op == "reduce_scatter" or comm_op == "reduce_scatter_base": + size *= n + tput = (size / duration) + busbw = (size / duration) * ((n - 1) / n) + elif comm_op == "all_reduce": + tput = (size * 2 / duration) + busbw = (size / duration) * (2 * (n - 1) / n) + elif comm_op == "send" or comm_op == "recv" or comm_op == "isend" or comm_op == "irecv" or comm_op == "broadcast" or comm_op == "reduce" or comm_op == "gather" or comm_op == "scatter" or comm_op == "barrier": + tput = (size / duration) + busbw = tput + else: + print_rank_0("wrong comm_op specified") # noqa: F821 + exit(0) + + # convert to Gbps + tput *= 8 + busbw *= 8 + + tput /= 1e6 + busbw /= 1e6 + + return tput, busbw + + +class CommsLogger: + def __init__(self): + from deepspeed.comm.constants import COMMS_LOGGER_VERBOSE_DEFAULT, COMMS_LOGGER_DEBUG_DEFAULT, COMMS_LOGGER_PROF_OPS_DEFAULT, COMMS_LOGGER_PROF_ALL_DEFAULT, COMMS_LOGGER_ENABLED_DEFAULT + self.comms_dict = {} + self.verbose = COMMS_LOGGER_VERBOSE_DEFAULT + self.debug = COMMS_LOGGER_DEBUG_DEFAULT + self.prof_ops = COMMS_LOGGER_PROF_OPS_DEFAULT + self.prof_all = COMMS_LOGGER_PROF_ALL_DEFAULT + self.enabled = COMMS_LOGGER_ENABLED_DEFAULT + + def configure(self, comms_config): + self.enabled = comms_config.comms_logger_enabled + if self.enabled: + self.verbose = comms_config.comms_logger.verbose + self.debug = comms_config.comms_logger.debug + self.prof_ops = comms_config.comms_logger.prof_ops + self.prof_all = comms_config.comms_logger.prof_all + + # There are three settings for the op profiler: + # - Global 
profiling (profile all comms) + # - Op-type profiling (e.g. profile all all_reduce comms) + # - Op profiling (e.g. profile a specific all_reduce op) + def start_profiling_comms(self): + self.prof_all = True + + def stop_profiling_comms(self): + self.prof_all = True + + # E.g. start_profiling_op('all_reduce') + def start_profiling_op(self, op_name_list): + self.prof_ops = list(set(self.prof_ops) | set(op_name_list)) + + def stop_profiling_op(self, op_name_list): + self.prof_ops = [op for op in self.prof_ops if op not in op_name_list] + + # Add log entry + def append(self, raw_name, record_name, latency, msg_size): + import deepspeed.comm as dist + algbw, busbw = calc_bw_log(raw_name, msg_size, latency) + if record_name in self.comms_dict.keys(): + # If this comm_op has already been logged with this message size, just add to existing record + if msg_size in self.comms_dict[record_name].keys(): + self.comms_dict[record_name][msg_size][0] += 1 + self.comms_dict[record_name][msg_size][1].append(latency) + self.comms_dict[record_name][msg_size][2].append(algbw) + self.comms_dict[record_name][msg_size][3].append(busbw) + # If this is a new message size for this comm_op, add new record under existing comm_op + else: + self.comms_dict[record_name][msg_size] = [1, [latency], [algbw], [busbw]] + else: + # Create entirely new record + self.comms_dict[record_name] = {msg_size: [1, [latency], [algbw], [busbw]]} + # If verbose, print every comm op + # TODO: Add to tensorboard + if self.verbose: + n = dist.get_world_size() + log_str = f"rank={dist.get_rank()} | comm op: " + record_name + " | time (ms): {:.2f}".format( + latency) + log_str += " | msg size: " + convert_size(msg_size) + log_str += " | algbw (Gbps): {:.2f} ".format(algbw) + log_str += " | busbw (Gbps): {:.2f} ".format(busbw) + log_dist(log_str, [0]) + + # Print summary at end of iteration, epoch, or training + def log_all(self): + from deepspeed.utils.timer import trim_mean + print( + f"{'Comm. 
Op': <20}{'Message Size': <20}{'Count': <20}{'Total Latency(ms)': <20}{'Avg Latency(ms)': <20}{'tput_avg (Gbps)': <20}{'busbw_avg (Gbps)': <20}" + ) + for record_name in self.comms_dict.keys(): + print(record_name) + for msg_size, vals in sorted(self.comms_dict[record_name].items()): + # vals[0] is the count for each msg size + count = vals[0] + # vals[1] is a list of latency records for each msg size + total_lat = sum(vals[1]) + # vals[2] and vals[3] are the lists of algbw and busbw, respectively + # Get rid of outliers when we print + avg_lat = trim_mean(vals[1], 0.1) + avg_algbw = trim_mean(vals[2], 0.1) + avg_busbw = trim_mean(vals[3], 0.1) + print( + f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}{avg_lat: <20.2f}{avg_algbw: <20.2f}{avg_busbw: <20.2f}" + ) diff --git a/deepspeed/utils/debug.py b/deepspeed/utils/debug.py index 358e997eae636a26abda2953110167df325566eb..f3f3ea56d5cfa6d9c551ec7d2db2ebb41dda995b 100644 --- a/deepspeed/utils/debug.py +++ b/deepspeed/utils/debug.py @@ -1,6 +1,8 @@ +'''Copyright The Microsoft DeepSpeed Team''' """ debug utils """ -import fcntl +# For lazy import with printflock() +fcntl = None # for debug purposes map module and param objects to their fully qualified names module_names = {} @@ -69,7 +71,7 @@ def printflock(*msgs): 2. Override the usual versions with :: def print_rank_0(message, debug=False, force=False): - rank = torch.distributed.get_rank() + rank = deepspeed.comm.get_rank() printflock(f"[{rank}] {message}") 3. run the program and you get both logs non-interleaved @@ -78,6 +80,9 @@ def printflock(*msgs): then compare those. """ + global fcntl + if fcntl == None: + import fcntl with open(__file__, "r") as fh: fcntl.flock(fh, fcntl.LOCK_EX) @@ -100,7 +105,7 @@ def log_rank_file(rank, *msgs): 2. 
Override the usual versions of print_rank_0 in those files with :: def print_rank_0(message, debug=False, force=False): - rank = torch.distributed.get_rank() + rank = deepspeed.comm.get_rank() log_rank_file(rank, message) 3. run the program diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py index 34fd96f1698af9d6ef300526d910b8215ccde2d8..a4d403a7dd0ac63f39a3fbbdcb34d6be7dd267c3 100644 --- a/deepspeed/utils/groups.py +++ b/deepspeed/utils/groups.py @@ -18,25 +18,27 @@ Copyright 2021 The Microsoft DeepSpeed Team # See the License for the specific language governing permissions and # limitations under the License. """ - Support different forms of parallelism in DeepSped using multiple process groups. + Support different forms of parallelism in DeepSpeed using multiple process groups. Given that there are multiple scenarios and use-cases, this file is going to be updated frequently. For now, the group creation needed for the training scenario is being implemented. For inference and other new scenarios, the code will be either reused or added to this file. """ -import torch -from torch.distributed.distributed_c10d import _get_global_rank -from deepspeed.utils import logger, log_dist +from deepspeed import comm as dist + +from deepspeed.utils import log_dist from deepspeed.utils.exceptions import DeprecatedException # Expert parallel group that the current rank belongs to. _EXPERT_PARALLEL_GROUP = {} # Expert data parallel group that the current rank belongs to. _EXPERT_DATA_PARALLEL_GROUP = {} -# torch.distributed world group needs to be cloned for some cases +# dist world group needs to be cloned for some cases _WORLD_GROUP = None # global object to maintain mpu object if passed by a Megatron client mpu = None +# global object that stores tensor parallel world size for experts +expert_tensor_parallel_world_size = 1 # Deprecated groups initialize function. 
@@ -79,32 +81,32 @@ def _create_model_parallel(model_parallel_size_): log_dist(f'Creating model parallel group with size {model_parallel_size_}', ranks=[0]) # Get world size and rank. Ensure some consistencies. - assert torch.distributed.is_initialized() - world_size = torch.distributed.get_world_size() + assert dist.is_initialized() + world_size = dist.get_world_size() model_parallel_size = min(model_parallel_size_, world_size) _ensure_divisibility(world_size, model_parallel_size) - rank = torch.distributed.get_rank() + rank = dist.get_rank() _DATA_PARALLEL_GROUP = None _MODEL_PARALLEL_GROUP = None # Build the data parallel groups. for i in range(model_parallel_size): ranks = range(i, world_size, model_parallel_size) - group = torch.distributed.new_group(ranks) + group = dist.new_group(ranks) if i == (rank % model_parallel_size): _DATA_PARALLEL_GROUP = group # Build the model parallel groups. for i in range(world_size // model_parallel_size): ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size) - group = torch.distributed.new_group(ranks) + group = dist.new_group(ranks) if i == (rank // model_parallel_size): _MODEL_PARALLEL_GROUP = group return _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP -def _create_expert_and_data_parallel(ep_size): +def _create_expert_and_data_parallel(expert_parallel_size_): """ Create expert and data parallel groups. 
@@ -117,13 +119,14 @@ def _create_expert_and_data_parallel(ep_size): expert_parallel_group = [0, 1], [2,3], [4,5], [6,7], [8,9] - no all reduce, but all to all data_parallel_group = [0,1,...,15] - all reduce is only on non-MoE """ - assert torch.distributed.is_initialized() + assert dist.is_initialized() - log_dist(f'Creating expert and data parallel groups with size {ep_size}', ranks=[0]) - world_size = torch.distributed.get_world_size() - rank = torch.distributed.get_rank() + log_dist( + f'Creating expert and data parallel groups with size {expert_parallel_size_}', + ranks=[0]) + world_size = dist.get_world_size() + rank = dist.get_rank() - expert_parallel_size_ = min(ep_size, world_size) _ensure_divisibility(world_size, expert_parallel_size_) group_name = f"ep_size_{expert_parallel_size_}" @@ -135,7 +138,7 @@ def _create_expert_and_data_parallel(ep_size): if group_name not in _EXPERT_DATA_PARALLEL_GROUP: for i in range(expert_parallel_size_): ranks = range(i, world_size, expert_parallel_size_) - group = torch.distributed.new_group(ranks) + group = dist.new_group(ranks) log_dist( f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}', [0]) @@ -149,7 +152,7 @@ def _create_expert_and_data_parallel(ep_size): if group_name not in _EXPERT_PARALLEL_GROUP: for i in range(world_size // expert_parallel_size_): ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_) - group = torch.distributed.new_group(ranks) + group = dist.new_group(ranks) log_dist( f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', [0]) @@ -218,14 +221,20 @@ def _create_expert_data_and_model_parallel(expert_parallel_size_, mpu): expert_parallel_group = [0,2,4,6], [8,10,12,14] [1,3,5,7], [9,11,13,15] expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15] """ - assert torch.distributed.is_initialized(), "torch distributed is not initialized" + assert dist.is_initialized(), "dist is not 
initialized" model_parallel_size_ = mpu.get_model_parallel_world_size() - world_size = torch.distributed.get_world_size() - rank = torch.distributed.get_rank() + global expert_tensor_parallel_world_size + expert_tensor_parallel_world_size = model_parallel_size_ + + world_size = dist.get_world_size() + rank = dist.get_rank() dp_world_size = mpu.get_data_parallel_world_size() dp_rank = mpu.get_data_parallel_rank() + _ensure_divisibility(world_size, model_parallel_size_) + _ensure_divisibility(dp_world_size, expert_parallel_size_) + log_dist( f"Creating deepspeed groups with model parallel size {model_parallel_size_}, expert parallel size {expert_parallel_size_}, world size {world_size}, dp world size {dp_world_size}", [0]) @@ -236,9 +245,6 @@ def _create_expert_data_and_model_parallel(expert_parallel_size_, mpu): _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group() _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group() - expert_parallel_size_ = min(expert_parallel_size_, dp_world_size) - _ensure_divisibility(world_size, expert_parallel_size_) - group_name = f"ep_size_{expert_parallel_size_}" # Only create groups if they don't already exist @@ -247,12 +253,12 @@ def _create_expert_data_and_model_parallel(expert_parallel_size_, mpu): expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks( world_size, model_parallel_size_, expert_parallel_size_) for ranks in expert_parallel_groups: - group = torch.distributed.new_group(ranks) + group = dist.new_group(ranks) if rank in list(ranks): _EXPERT_PARALLEL_GROUP[group_name] = group for ranks in expert_data_parallel_groups: - group = torch.distributed.new_group(ranks) + group = dist.new_group(ranks) if rank in list(ranks): _EXPERT_DATA_PARALLEL_GROUP[group_name] = group @@ -304,66 +310,64 @@ def _get_expert_data_parallel_group_dict(): def _clone_world_group(): """Create a clone of the world group - Note: We need to clone the torch.distributed world group because we - use _get_global_rank() utility 
function in DeepSpeed at many places. - As that function does not work on torch.distributed.group.WORLD, we + Note: We need to clone the dist world group because we + use dist.get_global_rank() utility function in DeepSpeed at many places. + As that function does not work on dist.group.WORLD, we need to keep a clone of it. """ - assert torch.distributed.is_initialized(), "torch.distributed is not initialized" + assert dist.is_initialized(), "dist is not initialized" global _WORLD_GROUP if _WORLD_GROUP is None: # If not cloned already, clone the world group - _WORLD_GROUP = torch.distributed.new_group( - ranks=range(torch.distributed.get_world_size())) + _WORLD_GROUP = dist.new_group(ranks=range(dist.get_world_size())) return _WORLD_GROUP def _get_data_parallel_group(): """Get the data parallel group the caller rank belongs to.""" - assert torch.distributed.is_initialized(), \ - 'torch.distributed is not initialized' + assert dist.is_initialized(), \ + 'dist is not initialized' global mpu if mpu is not None: return mpu.get_data_parallel_group() - # Return the clone of torch.distributed world group + # Return the clone of dist world group return _clone_world_group() def _get_broadcast_src_rank(): - return _get_global_rank(_get_data_parallel_group(), 0) + return dist.get_global_rank(_get_data_parallel_group(), 0) def _get_expert_broadcast_src_rank(group_name): - return _get_global_rank(_get_expert_data_parallel_group(group_name), 0) + return dist.get_global_rank(_get_expert_data_parallel_group(group_name), 0) def _get_expert_parallel_world_size(group_name): """Return world size for the expert parallel group.""" - return torch.distributed.get_world_size(group=_get_expert_parallel_group(group_name)) + return dist.get_world_size(group=_get_expert_parallel_group(group_name)) def _get_expert_data_parallel_world_size(group_name): """Return world size for the expert data parallel group.""" - return torch.distributed.get_world_size( - 
group=_get_expert_data_parallel_group(group_name)) + return dist.get_world_size(group=_get_expert_data_parallel_group(group_name)) def _get_expert_parallel_rank(group_name): """Return my rank for the expert parallel group.""" - return torch.distributed.get_rank(group=_get_expert_parallel_group(group_name)) + return dist.get_rank(group=_get_expert_parallel_group(group_name)) def _get_expert_parallel_src_rank(group_name): """Calculate the global rank corresponding to a local rank zero in the expert parallel group.""" - global_rank = torch.distributed.get_rank() + global_rank = dist.get_rank() local_world_size = _get_expert_parallel_world_size(group_name) return (global_rank // local_world_size) * local_world_size def _get_expert_data_parallel_rank(group_name): """Return my rank for the expert data parallel group.""" - return torch.distributed.get_rank(group=_get_expert_data_parallel_group(group_name)) + return dist.get_rank(group=_get_expert_data_parallel_group(group_name)) def _get_data_parallel_world_size(): @@ -371,7 +375,7 @@ def _get_data_parallel_world_size(): global mpu if mpu is not None: return mpu.get_data_parallel_world_size() - return torch.distributed.get_world_size(group=_get_data_parallel_group()) + return dist.get_world_size(group=_get_data_parallel_group()) def _get_model_parallel_world_size(): @@ -387,4 +391,9 @@ def _get_data_parallel_rank(): global mpu if mpu is not None: return mpu.get_data_parallel_rank() - return torch.distributed.get_rank(group=_get_data_parallel_group()) + return dist.get_rank(group=_get_data_parallel_group()) + + +def _get_expert_model_parallel_world_size(): + global expert_tensor_parallel_world_size + return expert_tensor_parallel_world_size diff --git a/deepspeed/utils/init_on_device.py b/deepspeed/utils/init_on_device.py new file mode 100644 index 0000000000000000000000000000000000000000..1b51efe603840deef09269387b227304ae52bcb9 --- /dev/null +++ b/deepspeed/utils/init_on_device.py @@ -0,0 +1,81 @@ +''' +Copyright 2020 
The Microsoft DeepSpeed Team +''' +import torch +from typing import Callable +from torch import Tensor +from packaging import version as pkg_version + + +class OnDevice(object): + """ + Create modules/tensors w. specific devices and dtypes. Examples: + + Create MyModule which consists of many different sub-modules and parameters. In this case we can create + MyModule as a collection of 'meta' tensors by passing `device='meta'` or we can create the module _directly_ + on a CUDA device by passing `device=f'cuda:{local_rank}'` (where `local_rank` is the local GPU id. + + with OnDevice(dtype=torch.float16, device='meta'): + model = MyModel() + + with OnDevice(dtype=torch.float16, device=f'cuda:{local_rank}'): + model = MyModel() + + """ + + _orig_torch_empty = torch.empty + _orig_torch_zeros = torch.zeros + _orig_torch_ones = torch.ones + _orig_torch_full = torch.full + + def __init__(self, dtype, device="meta", enabled=True): + self.dtype = dtype + self.enabled = enabled + self.device = device + + if device == "meta": + if pkg_version.parse('1.10') > pkg_version.parse(torch.__version__): + raise NotImplementedError( + "Meta tensor support is not available, please upgrade to torch 1.10+" + ) + + def fp_tensor_constructor(self, + fn: Callable, + target_fp_dtype: torch.dtype) -> Callable: + def wrapped_fn(*args, **kwargs) -> Tensor: + if kwargs.get("device", None) is None: + kwargs['device'] = self.device + tensor: Tensor = fn(*args, **kwargs) + if tensor.is_floating_point(): + tensor = tensor.to(target_fp_dtype) + return tensor + + return wrapped_fn + + def get_new_tensor_fn_for_dtype(self, dtype: torch.dtype) -> Callable: + def new_tensor(cls, *args) -> Tensor: + tensor = OnDevice._orig_torch_empty(0, device=self.device).new_empty(*args) + if tensor.is_floating_point(): + tensor = tensor.to(dtype) + return tensor + + return new_tensor + + def __enter__(self): + if not self.enabled: + return + torch.Tensor.__old_new__ = torch.Tensor.__new__ + torch.Tensor.__new__ = 
self.get_new_tensor_fn_for_dtype(self.dtype) + torch.empty = self.fp_tensor_constructor(self._orig_torch_empty, self.dtype) + torch.zeros = self.fp_tensor_constructor(self._orig_torch_zeros, self.dtype) + torch.ones = self.fp_tensor_constructor(self._orig_torch_ones, self.dtype) + torch.full = self.fp_tensor_constructor(self._orig_torch_full, self.dtype) + + def __exit__(self, exc_type, exc_value, traceback): + if not self.enabled: + return + torch.Tensor.__new__ = torch.Tensor.__old_new__ + torch.empty = self._orig_torch_empty + torch.zeros = self._orig_torch_zeros + torch.ones = self._orig_torch_ones + torch.full = self._orig_torch_full diff --git a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py index 3a089b3dd1b8b37fc256d81b895de4cd11b3f0cd..631ee2177c65d37af153b314a54595a19ddc927d 100644 --- a/deepspeed/utils/logging.py +++ b/deepspeed/utils/logging.py @@ -1,9 +1,9 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import logging import sys import os -import torch.distributed as dist - log_levels = { "debug": logging.DEBUG, "info": logging.INFO, @@ -46,7 +46,15 @@ class LoggerFactory: logger = LoggerFactory.create_logger(name="DeepSpeed", level=logging.INFO) +def print_configuration(args, name): + logger.info("{}:".format(name)) + for arg in sorted(vars(args)): + dots = "." 
* (29 - len(arg)) + logger.info(" {} {} {}".format(arg, dots, getattr(args, arg))) + + def log_dist(message, ranks=None, level=logging.INFO): + from deepspeed import comm as dist """Log message when one of following condition meets + not dist.is_initialized() @@ -70,6 +78,7 @@ def log_dist(message, ranks=None, level=logging.INFO): def print_json_dist(message, ranks=None, path=None): + from deepspeed import comm as dist """Print message when one of following condition meets + not dist.is_initialized() diff --git a/deepspeed/utils/mixed_precision_linkage.py b/deepspeed/utils/mixed_precision_linkage.py new file mode 100644 index 0000000000000000000000000000000000000000..ac785ed9d2bf5aa2b5e4ed83a89326920534e0ca --- /dev/null +++ b/deepspeed/utils/mixed_precision_linkage.py @@ -0,0 +1,58 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" +import types +from deepspeed.utils import get_full_hp_param, get_full_hp_grad, get_hp_fragment_mapping + + +def link_hp_params(lp_param_list, + flat_hp_partition, + gradient_dict, + offload_gradient_dict, + use_offload, + param_group_index, + partition_start, + partition_size, + partition_optimizer_state, + dp_group): + local_lp_param_and_offset = _init_lp_to_hp_mapping(lp_param_list, + partition_start, + partition_size, + dp_group) + + for lp_param, lp_start in local_lp_param_and_offset: + lp_param._hp_mapping = get_hp_fragment_mapping(lp_param, + lp_start, + flat_hp_partition, + gradient_dict, + offload_gradient_dict, + use_offload, + param_group_index, + partition_start, + partition_size, + partition_optimizer_state) + + +def _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group): + current_offset = 0 + param_and_offset_list = [] + partition_end = partition_start + partition_size + index_in_param_group = 0 + for i, lp_param in enumerate(lp_param_list): + lp_param._hp_mapping = None + lp_param._dp_group = dp_group + lp_param.get_full_hp_param = types.MethodType(get_full_hp_param, lp_param) + 
lp_param.get_full_hp_grad = types.MethodType(get_full_hp_grad, lp_param) + + # lp_param overlaps with partition if both are true + # 1) current_offset < partition_end, + # 2) current_offset + lp_param.numel() >= partition_start + lp_param_end = current_offset + lp_param.numel() + if current_offset < partition_end and lp_param_end > partition_start: + param_and_offset_list.append((lp_param, current_offset)) + lp_param._index_in_param_group = index_in_param_group + # Indices for params in this partition/GPU + index_in_param_group += 1 + current_offset += lp_param.numel() + + return param_and_offset_list diff --git a/deepspeed/utils/nvtx.py b/deepspeed/utils/nvtx.py index 11579ffbb17f1338d7717fe0eafa390ce7e0fea0..881000311c36ab4d4f765e90acd022e01dc86a7f 100644 --- a/deepspeed/utils/nvtx.py +++ b/deepspeed/utils/nvtx.py @@ -1,15 +1,15 @@ -import torch +'''Copyright The Microsoft DeepSpeed Team''' + +from deepspeed.accelerator import get_accelerator def instrument_w_nvtx(func): """decorator that causes an NVTX range to be recorded for the duration of the function call.""" - if hasattr(torch.cuda.nvtx, "range"): - - def wrapped_fn(*args, **kwargs): - with torch.cuda.nvtx.range(func.__qualname__): - return func(*args, **kwargs) + def wrapped_fn(*args, **kwargs): + get_accelerator().range_push(func.__qualname__) + ret_val = func(*args, **kwargs) + get_accelerator().range_pop() + return ret_val - return wrapped_fn - else: - return func + return wrapped_fn diff --git a/deepspeed/utils/tensor_fragment.py b/deepspeed/utils/tensor_fragment.py new file mode 100644 index 0000000000000000000000000000000000000000..8204735748c5ab7a1b9f71ef8405983644fcafae --- /dev/null +++ b/deepspeed/utils/tensor_fragment.py @@ -0,0 +1,284 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" + +import torch +from dataclasses import dataclass +from deepspeed import comm as dist + + +@dataclass +class fragment_address: + numel: int + start: int + + +@dataclass +class tensor_fragment: + 
lp_fragment: torch.Tensor + lp_fragment_address: fragment_address + hp_fragment: torch.Tensor + hp_fragment_address: fragment_address + optim_fragment: {} + gradient_dict: {} + offload_gradient_dict: {} + use_offload: bool + param_group_index: int + + def update_hp(self): + self.hp_fragment.data.copy_(self.lp_fragment.data) + + def update_lp(self): + self.lp_fragment.data.copy_(self.hp_fragment.data) + + def get_optim_state_fragment(self, key): + if key in self.optim_fragment: + return self.optim_fragment[key] + else: + raise ValueError(f'{key} not found in optimizer state fragment') + + def get_hp_fragment_address(self): + return self.hp_fragment_address + + def get_optim_state_keys(self): + return list(self.optim_fragment.keys()) + + +def get_full_hp_param(self, optim_state_key=None): + reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() + if self._hp_mapping is not None: + lp_frag_address = self._hp_mapping.lp_fragment_address + reduce_fragment = torch.narrow(reduce_buffer, + 0, + lp_frag_address.start, + lp_frag_address.numel) + if optim_state_key is None: + hp_fragment = self._hp_mapping.hp_fragment + else: + hp_fragment = self._hp_mapping.get_optim_state_fragment(optim_state_key) + + reduce_fragment.data.copy_(hp_fragment.data) + dist.all_reduce(reduce_buffer, group=self._dp_group) + return reduce_buffer.reshape_as(self) + + +def get_full_hp_grad(self): + reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() + if self._hp_mapping is not None: + hp_mapping = self._hp_mapping + + if hp_mapping.use_offload: + gradient_dict = hp_mapping.offload_gradient_dict + else: + gradient_dict = hp_mapping.gradient_dict + + if hp_mapping.param_group_index not in gradient_dict or gradient_dict[ + hp_mapping.param_group_index] is None: + raise ValueError( + "Gradients are only available immediately after backward and before engine step" + ) + + lp_grad_fragment = gradient_dict[hp_mapping.param_group_index][ + self._index_in_param_group] + 
hp_grad_fragment = lp_grad_fragment.to(torch.float32).flatten() + + lp_frag_address = self._hp_mapping.lp_fragment_address + reduce_fragment = torch.narrow(reduce_buffer, + 0, + lp_frag_address.start, + lp_frag_address.numel) + + if self.view(-1).shape == hp_grad_fragment.shape: + reduce_buffer.data.copy_(hp_grad_fragment.data) + else: + reduce_fragment.data.copy_(hp_grad_fragment.data) + + dist.all_reduce(reduce_buffer, group=self._dp_group) + return reduce_buffer.reshape_as(self) + + +def safe_get_full_fp32_param(param): + """Assemble and return the fp32 parameter of a low-precision (e.g., fp16) parameter. + + Args: + param (``torch.nn.Parameter``): A model parameter + """ + # ZeRO stage 3 param + if hasattr(param, 'ds_id'): + return param._z3_optimizer.get_full_hp_param(param) + + # ZeRO stage 1, 2, and bf16_optimizer params + if hasattr(param, '_hp_mapping'): + return param.get_full_hp_param() + return None + + +def safe_get_full_optimizer_state(param, optim_state_key): + """Assemble and return the fp32 optimizer state of a low-precision (e.g., fp16) parameter. + + Args: + param (``torch.nn.Parameter``): A model parameter + """ + # ZeRO stage 3 param + if hasattr(param, 'ds_id'): + return param._z3_optimizer.get_full_hp_param(param, optim_state_key) + + # ZeRO stage 1, 2, and bf16_optimizer params + if hasattr(param, '_hp_mapping'): + return param.get_full_hp_param(optim_state_key) + return None + + +# TODO: Figure out the correct return dtype +def safe_get_full_grad(param): + """Assemble and return the fp32 gradient of a low-precision (e.g., fp16) parameter. 
+ + Args: + param (``torch.nn.Parameter``): A model parameter + """ + if param.grad is not None: + return param.grad + + # ZeRO stage 3 param + if hasattr(param, 'ds_id'): + return param._z3_optimizer.get_fp32_grad_for_param(param) + + # ZeRO stage 1, 2, and bf16_optimizer params + if hasattr(param, '_hp_mapping'): + return param.get_full_hp_grad() + + return None + + +def get_hp_fragment_mapping(lp_param, + lp_start, + flat_hp_partition, + gradient_dict, + offload_gradient_dict, + use_offload, + param_group_index, + partition_start, + partition_size, + optimizer_state_dict): + lp_end = lp_param.numel() + lp_start + hp_start = partition_start + hp_end = partition_start + partition_size + + fragment_start = max(lp_start, hp_start) + fragment_end = min(lp_end, hp_end) + assert fragment_start < fragment_end, \ + f'fragment start {fragment_start} should be < fragment_end {fragment_end}' + + fragment_numel = fragment_end - fragment_start + hp_frag_address = fragment_address(start=fragment_start - hp_start, + numel=fragment_numel) + hp_fragment_tensor = flat_hp_partition.narrow(0, + hp_frag_address.start, + hp_frag_address.numel) + optim_fragment = { + key: value.narrow(0, + hp_frag_address.start, + hp_frag_address.numel) + for key, + value in optimizer_state_dict.items() + if torch.is_tensor(value) and value.shape == flat_hp_partition.shape + } + + lp_frag_address = fragment_address(start=fragment_start - lp_start, + numel=fragment_numel) + lp_fragment_tensor = lp_param.flatten().narrow(0, + lp_frag_address.start, + lp_frag_address.numel) + + return tensor_fragment(lp_fragment=lp_fragment_tensor, + lp_fragment_address=lp_frag_address, + hp_fragment=hp_fragment_tensor, + hp_fragment_address=hp_frag_address, + optim_fragment=optim_fragment, + gradient_dict=gradient_dict, + offload_gradient_dict=offload_gradient_dict, + use_offload=use_offload, + param_group_index=param_group_index) + + +''' +Logic for lp_param to hp_param mapping + +lp lp0 lp1 lp2 lp3 lp4 <------- 
indices/names +lp [ ][ ][ ][ ][ ] <-------- tensors +flat_lp [ ] <-------- flat lp params +flat_hp [ ] <------------------ flat hp partition on current rank +full_hp [ ] <------- full flat hp params + + +lp2 + full numel = 16 + lp_frag + numel = 12 + frag_start = 3 + frag_end = 15 + hp_frag + numel = 12 + frag_start = 0 + frag_end = 11 + + hp_frag.copy_(lp_frag) + + +lp3: + full numel = 4 + lp_frag + numel = 4 + start = 0 + end = 3 + hp_frag + numel = 4 + start = 12 + end = 15 + + +lp4: + full numel = 12 + lp_frag + numel = 4 + start = 0 + end = 3 + hp_frag + numel = 4 + start = 16 + end = 19 + + + +Visual depiction of above +lp { } +flat_lp [ ] +flat_hp ( ) + + +flat_lp [ { ( } ) ] + lx hx ly hy + ly-hx + + +lp { } +flat_lp [ ] +flat_hp ( ) + + +flat_lp [ ( { ) } ] + hx lx hy ly + hy-lx + +lp { } +flat_lp [ ] +flat_hp ( ) + + +flat_lp [ ( { } ) ] + hx lx ly hy + ly-lx + +lp -> (lx, hy) +flat_hp -> (hx, hy) +''' diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py old mode 100644 new mode 100755 index 75b9bcccb4132ba7afa4411069c1ab93e10978f6..16865c5e56c2a5d1a27744a262407817d9b0cb09 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -2,14 +2,11 @@ Copyright 2019 The Microsoft DeepSpeed Team """ -from numpy.core.numeric import count_nonzero -from deepspeed.elasticity.elasticity import compute_elastic_config import time -import torch from numpy import mean from deepspeed.utils.logging import log_dist - -from deepspeed.utils import logger +from deepspeed.accelerator import get_accelerator +from deepspeed import comm as dist try: import psutil @@ -21,12 +18,14 @@ except ImportError: class CudaEventTimer(object): - def __init__(self, start_event: torch.cuda.Event, end_event: torch.cuda.Event): + def __init__(self, + start_event: get_accelerator().Event, + end_event: get_accelerator().Event): self.start_event = start_event self.end_event = end_event def get_elapsed_msec(self): - torch.cuda.current_stream().wait_event(self.end_event) + 
get_accelerator().current_stream().wait_event(self.end_event) self.end_event.synchronize() return self.start_event.elapsed_time(self.end_event) @@ -44,15 +43,15 @@ class SynchronizedWallClockTimer: def start(self): """Start the timer.""" - assert not self.started_, f"{self.name} timer has already been started" - self.start_event = torch.cuda.Event(enable_timing=True) + assert not self.started_, f"{self.name_} timer has already been started" + self.start_event = get_accelerator().Event(enable_timing=True) self.start_event.record() self.started_ = True def stop(self, reset=False, record=False): """Stop the timer.""" assert self.started_, "timer is not started" - end_event = torch.cuda.Event(enable_timing=True) + end_event = get_accelerator().Event(enable_timing=True) end_event.record() self.event_timers.append(CudaEventTimer(self.start_event, end_event)) self.start_event = None @@ -87,11 +86,15 @@ class SynchronizedWallClockTimer: return elapsed_ def mean(self): + self.elapsed(reset=False) return trim_mean(self.elapsed_records, 0.1) def __init__(self): self.timers = {} + def get_timers(self): + return self.timers + def __call__(self, name): if name not in self.timers: self.timers[name] = self.Timer(name) @@ -99,20 +102,20 @@ class SynchronizedWallClockTimer: @staticmethod def memory_usage(): - alloc = "mem_allocated: {:.4f} GB".format(torch.cuda.memory_allocated() / + alloc = "mem_allocated: {:.4f} GB".format(get_accelerator().memory_allocated() / (1024 * 1024 * 1024)) max_alloc = "max_mem_allocated: {:.4f} GB".format( - torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024)) - cache = "cache_allocated: {:.4f} GB".format(torch.cuda.memory_cached() / + get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024)) + cache = "cache_allocated: {:.4f} GB".format(get_accelerator().memory_cached() / (1024 * 1024 * 1024)) max_cache = "max_cache_allocated: {:.4f} GB".format( - torch.cuda.max_memory_cached() / (1024 * 1024 * 1024)) + get_accelerator().max_memory_cached() 
/ (1024 * 1024 * 1024)) return " | {} | {} | {} | {}".format(alloc, max_alloc, cache, max_cache) def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False, ranks=None): """Log a group of timers.""" assert normalizer > 0.0 - string = f"rank={torch.distributed.get_rank()} time (ms)" + string = f"rank={dist.get_rank()} time (ms)" for name in names: if name in self.timers: elapsed_time = (self.timers[name].elapsed(reset=reset) / normalizer) @@ -135,24 +138,22 @@ class ThroughputTimer: def __init__( self, batch_size, - num_workers, start_step=2, steps_per_output=50, monitor_memory=False, logging_fn=None, ): + from deepspeed.utils import logger self.start_time = 0 self.end_time = 0 self.started = False - self.batch_size = batch_size - if batch_size is None: - self.batch_size = 1 - self.num_workers = num_workers + self.batch_size = 1 if batch_size is None else batch_size self.start_step = start_step self.epoch_count = 0 - self.local_step_count = 0 - self.total_step_count = 0 + self.micro_step_count = 0 + self.global_step_count = 0 self.total_elapsed_time = 0 + self.step_elapsed_time = 0 self.steps_per_output = steps_per_output self.monitor_memory = monitor_memory self.logging = logging_fn @@ -165,7 +166,7 @@ class ThroughputTimer: def update_epoch_count(self): self.epoch_count += 1 - self.local_step_count = 0 + self.micro_step_count = 0 def _init_timer(self): self.initialized = True @@ -173,49 +174,60 @@ class ThroughputTimer: def start(self): self._init_timer() self.started = True - if self.total_step_count >= self.start_step: - torch.cuda.synchronize() + if self.global_step_count >= self.start_step: + get_accelerator().synchronize() self.start_time = time.time() - def stop(self, report_speed=True): + def stop(self, global_step=False, report_speed=True): if not self.started: return self.started = False - self.total_step_count += 1 - self.local_step_count += 1 - if self.total_step_count > self.start_step: - torch.cuda.synchronize() + self.micro_step_count += 
1 + if global_step: + self.global_step_count += 1 + + if self.start_time > 0: + get_accelerator().synchronize() self.end_time = time.time() duration = self.end_time - self.start_time self.total_elapsed_time += duration - if self.local_step_count % self.steps_per_output == 0: - if report_speed: + self.step_elapsed_time += duration + + if global_step: + if report_speed and self.global_step_count % self.steps_per_output == 0: self.logging( - "{}/{}, SamplesPerSec={}, MemAllocated={}GB, MaxMemAllocated={}GB" - .format(self.epoch_count, - self.local_step_count, - self.avg_samples_per_sec(), - round(torch.cuda.memory_allocated() / 1024**3, - 2), - round(torch.cuda.max_memory_allocated() / 1024**3, - 2))) - if self.monitor_memory: - virt_mem = psutil.virtual_memory() - swap = psutil.swap_memory() - self.logging("{}/{}, vm percent: {}, swap percent: {}".format( - self.epoch_count, - self.local_step_count, - virt_mem.percent, - swap.percent, - )) + "epoch={}/micro_step={}/global_step={}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, " + "MemAllocated={}GB, MaxMemAllocated={}GB".format( + self.epoch_count, + self.micro_step_count, + self.global_step_count, + self.avg_samples_per_sec(), + self.batch_size / self.step_elapsed_time, + round(get_accelerator().memory_allocated() / 1024**3, + 2), + round(get_accelerator().max_memory_allocated() / 1024**3, + 2), + )) + if self.monitor_memory: + virt_mem = psutil.virtual_memory() + swap = psutil.swap_memory() + self.logging( + "epoch={}/micro_step={}/global_step={}, vm %: {}, swap %: {}" + .format( + self.epoch_count, + self.micro_step_count, + self.global_step_count, + virt_mem.percent, + swap.percent, + )) + self.step_elapsed_time = 0 def avg_samples_per_sec(self): - if self.total_step_count > 0: - samples_per_step = self.batch_size * self.num_workers - total_step_offset = self.total_step_count - self.start_step + if self.global_step_count > 0: + total_step_offset = self.global_step_count - self.start_step avg_time_per_step = 
self.total_elapsed_time / total_step_offset # training samples per second - return samples_per_step / avg_time_per_step + return self.batch_size / avg_time_per_step return float("-inf") @@ -231,6 +243,9 @@ def trim_mean(data, trim_percent): """ assert trim_percent >= 0.0 and trim_percent <= 1.0 n = len(data) + # Account for edge case of empty list + if len(data) == 0: + return 0 data.sort() k = int(round(n * (trim_percent))) return mean(data[k:n - k]) diff --git a/deepspeed/utils/types.py b/deepspeed/utils/types.py new file mode 100644 index 0000000000000000000000000000000000000000..a6d5ffd24b8f31a4247f18fe80e4d1e945467772 --- /dev/null +++ b/deepspeed/utils/types.py @@ -0,0 +1,9 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from enum import IntEnum + + +class ActivationFuncType(IntEnum): + UNKNOWN = 0 + GELU = 1 + ReLU = 2 diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py old mode 100644 new mode 100755 index 8e8b0fd17dd18a28a5cd7e92c9829fd165e93bc4..f00e256bb7879727ab1d785173f4aac6967876da --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +'''Copyright The Microsoft DeepSpeed Team''' # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in @@ -17,11 +18,9 @@ from collections import OrderedDict # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with # DeepSpeed data structures it has to be available in the current python environment. 
-import deepspeed from deepspeed.utils import logger from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, - PARAM_SHAPES, SINGLE_PARTITION_OF_FP32_GROUPS, FP32_FLAT_GROUPS, ZERO_STAGE, diff --git a/docker/Dockerfile b/docker/Dockerfile old mode 100644 new mode 100755 diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 5e3f756dcd35833132ccff071139e0e7769cf4ab..cdfe5e85db4e1472027830d81707ff7772571dc2 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -1,4 +1,4 @@ -FROM rocm/pytorch:rocm5.0.1_ubuntu18.04_py3.7_pytorch_1.10.0 +FROM rocm/pytorch:latest # install latest released version of deepspeed RUN pip install deepspeed && \ diff --git a/docs/README.md b/docs/README.md index 3af6830ab3b2ff3ebeca0b6c1d62d9dbd2e75b9c..fbd9b68ac20e3cc127c3809a96772bedd8921c3a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -47,3 +47,13 @@ You can now start a local webserver via: bundle exec jekyll serve ``` The website should now be accessible at [http://localhost:4000](http://localhost:4000) + + +## Update the Readthedocs.io API documentation +Use the following steps to update the public API documentation. + +1. Make your documentation changes and push them to the rtd-staging branch. This will rebuild the docs in the staging branch. +**NOTE**: It is acceptable to force push to this branch to overwrite previous changes. +2. View the result of the result of the build [here](https://readthedocs.org/projects/deepspeed/builds/) +3. Once the build is complete view the newly modified API documentation [here](https://deepspeed.readthedocs.io/en/rtd-staging/) +4. Once you are satisfied with the changes create a new branch off of rtd-staging to push into master. 
diff --git a/docs/_config.yml b/docs/_config.yml index dc79fc033b1a98362d0ab1dc73f5b2f25fad9507..7127b8459fe236df7ac1a2629517a652d559c4b1 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -35,10 +35,12 @@ collections: - advanced-install.md - getting-started.md - azure.md + - automatic-tensor-parallelism.md - bert-finetuning.md - bert-pretraining.md - cifar-10.md - curriculum-learning.md + - data-efficiency.md - flops-profiler.md - pytorch-profiler.md - autotuning.md @@ -48,6 +50,9 @@ collections: - mixture-of-experts.md - mixture-of-experts-nlg.md - mixture-of-experts-inference.md + - model-compression.md + - monitor.md + - comms-logging.md - one-cycle.md - onebit-adam.md - zero-one-adam.md @@ -78,6 +83,8 @@ defaults: path: "_pages" values: permalink: /docs/:basename/ + toc: true + toc_label: "Contents" - scope: path: "" type: posts diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml old mode 100644 new mode 100755 index 20f00b66760f4030904e00bd5c14fb3bf84760ad..6f7c443c7958e9f3000f693eed340e4250d46589 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -11,20 +11,15 @@ main: url: https://github.com/microsoft/DeepSpeed lnav: - - title: 'Feature Overview' - url: /features/ + - title: 'Training' + url: /training/ + - title: 'Inference' + url: /inference/ + - title: 'Compression' + url: /compression/ - title: 'Getting Started' url: /getting-started/ - children: - - title: 'Installation' - url: /getting-started/#installation - - title: 'Writing models' - url: /getting-started/#writing-deepspeed-models - - title: 'Training' - url: /getting-started/#training - - title: 'Launching' - url: /getting-started/#launching-deepspeed-training - - title: 'Configuration' + - title: 'ds_config' url: /docs/config-json/ children: - title: 'Autotuning' @@ -33,36 +28,24 @@ lnav: url: /docs/config-json/#batch-size-related-parameters - title: 'Optimizer' url: /docs/config-json/#optimizer-parameters - - title: 'Scheduler' - url: 
/docs/config-json/#scheduler-parameters - - title: 'Communication' - url: /docs/config-json/#communication-options - title: 'FP16' url: /docs/config-json/#fp16-training-options - title: 'BFLOAT16' url: /docs/config-json/#bfloat16-training-options - - title: 'Gradient Clipping' - url: /docs/config-json/#gradient-clipping - title: 'ZeRO optimizations' url: /docs/config-json/#zero-optimizations-for-fp16-training - - title: 'Parameter Offloading' - url: /docs/config-json/#parameter-offloading - - title: 'Optimizer Offloading' - url: /docs/config-json/#optimizer-offloading - - title: 'Asynchronous I/O' - url: /docs/config-json/#asynchronous-io - title: 'Logging' url: /docs/config-json/#logging - title: 'Flops Profiler' url: /docs/config-json/#flops-profiler - - title: 'PyTorch Profiler' - url: /docs/config-json/#pytorch-profiler - - title: 'Activation checkpointing' - url: /docs/config-json/#activation-checkpointing - - title: 'Sparse Attention' - url: /docs/config-json/#sparse-attention - - title: 'Logging to TensorBoard' - url: /docs/config-json/#tensorboard-options + - title: 'Monitoring' + url: /docs/config-json/#monitoring-module-tensorboard-wandb-csv + - title: 'Communication Logging' + url: /docs/config-json/#communication-logging + - title: 'Model Compression' + url: /docs/config-json/#compression + - title: 'Data Efficiency' + url: /docs/config-json/#data-efficiency - title: 'Tutorials' url: /tutorials/ children: @@ -70,6 +53,8 @@ lnav: url: /getting-started/ - title: 'Getting started on Azure' url: /tutorials/azure/ + - title: 'Automatic Tensor Parallelism' + url: /tutorials/automatic-tensor-parallelism/ - title: 'Autotuning' url: /tutorials/autotuning/ - title: 'BingBertSQuAD Fine-tuning' @@ -80,6 +65,8 @@ lnav: url: /tutorials/cifar-10/ - title: 'Curriculum Learning' url: /tutorials/curriculum-learning/ + - title: 'Data Efficiency' + url: /tutorials/data-efficiency/ - title: 'Flops Profiler' url: /tutorials/flops-profiler/ - title: 'PyTorch Profiler' @@ 
-98,8 +85,14 @@ lnav: url: /tutorials/mixture-of-experts-nlg/ - title: 'MoE Inference' url: /tutorials/mixture-of-experts-inference/ + - title: 'Model Compression' + url: /tutorials/model-compression/ - title: 'Mixture-of-Quantization' url: /tutorials/MoQ-tutorial/ + - title: 'Monitoring' + url: /tutorials/monitor + - title: 'Communication Logging' + url: /tutorials/comms-logging - title: 'One-Cycle Schedule' url: /tutorials/one-cycle/ - title: 'One-Bit Adam' diff --git a/docs/_pages/compression.md b/docs/_pages/compression.md new file mode 100644 index 0000000000000000000000000000000000000000..1a7b40d0cf1f217f310811bf9b7fb63741210ec2 --- /dev/null +++ b/docs/_pages/compression.md @@ -0,0 +1,12 @@ +--- +title: "Compression Overview and Features" +layout: single +permalink: /compression/ +toc: true +toc_label: "Contents" +--- + + +DeepSpeed Compression is a library purposely built to make it easy to compress models for researchers and practitioners while delivering faster speed, smaller model size, and significantly reduced compression cost. Please refer to our [blog](https://www.microsoft.com/en-us/research/blog/deepspeed-compression-a-composable-library-for-extreme-compression-and-zero-cost-quantization/) for more details. + +DeepSpeed Compression offers novel state-of-the-art compression techniques to achieve faster model compression with better model quality and lower compression cost. DeepSpeed Compression also takes an end-to-end approach to improve the computation efficiency of compressed models via a highly optimized inference engine. Furthermore, our library has multiple built-in state-of-the-art compression methods. It supports the synergistic composition of these methods and the system optimizations, offering the best of both worlds while allowing a seamless and easy-to-use pipeline for efficient DL model inference. 
We highly recommend you also to read our blog to learn more about (at a high level) why we build DeepSpeed Compression and what benefits it provides to users. To try compress your model using DeepSpeed compression library, please checkout our [tutorial](https://www.deepspeed.ai/tutorials/model-compression/). diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md old mode 100644 new mode 100755 index 53df586ec3e6f00746f266c3d11aff94d3015fd1..2d497bb1b567f4f815fcdf4cf4f39abefc1cbb55 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -1,5 +1,7 @@ --- title: "DeepSpeed Configuration JSON" +toc: true +toc_label: "Contents" --- ### Batch Size Related Parameters @@ -106,6 +108,7 @@ A variant ***optimizer*** for 1-bit Adam is 0/1 Adam, which further optimizes 1- } ``` 0/1 Adam supports the following params key/values in addition to standard Adam (learn more in our [tutorial](/tutorial/zero-one-adam/).) + | "params" key | Description | Default | | ------------------- | ---------------------------------------------------------------------------------- | ------- | | var\_freeze\_step | The latest step to update the variance | 100000 | @@ -216,8 +219,9 @@ Example of **scheduler** ```json "fp16": { "enabled": true, + "auto_cast": false, "loss_scale": 0, - "initial_scale_power": 32, + "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 @@ -230,6 +234,12 @@ Example of **scheduler** | ------------------------------------------------------------------------------------------- | ------- | | **enabled** is a **fp16** parameter indicating whether or not FP16 training enabled. 
| `false` | +**fp16:auto_cast**: [boolean] + +| Description | Default | +| -------------------------------------------------------------| ------- | +| **auto_cast** automatically casts inputs to **fp16** | `false` | + **fp16:loss_scale**: [float] | Description | Default | @@ -258,7 +268,7 @@ Example of **scheduler** | Description | Default | | ----------------------------------------------------------------------------------------------------- | ------- | -| **min_loss_scale** is a **fp16** parameter representing the minimum dynamic loss scale value. | `1000` | +| **min_loss_scale** is a **fp16** parameter representing the minimum dynamic loss scale value. | `1` | ### BFLOAT16 training options @@ -363,7 +373,7 @@ Enabling and configuring ZeRO memory optimizations | Description | Default | | --------------------------------------------------------------------------------------------------------- | ------- | -| Enable ZeRO memory optimization wrapper for FP16 Training. Currently compatible only with Adam optimizer. | `false` | +| Enable ZeRO memory optimizations, compatible with FP16/BF16/FP32 and the Adam optimizer. | `false` | **stage**: [integer] @@ -417,7 +427,7 @@ Enabling and configuring ZeRO memory optimizations | Description | Default | | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Stage 2 optimization for CPU offloading that parallelizes gradient copying to CPU memory among ranks by fine-grained gradient partitioning. Performance benefit grows with gradient accumulation steps (more copying between optimizer steps) or GPU count (increased parallelism). | `False` | +| Stage 1 and 2 optimization for CPU offloading that parallelizes gradient copying to CPU memory among ranks by fine-grained gradient partitioning. 
Performance benefit grows with gradient accumulation steps (more copying between optimizer steps) or GPU count (increased parallelism). | `False` | ***offload_param***: [dictionary] @@ -429,7 +439,7 @@ Enabling and configuring ZeRO memory optimizations | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------- | -| Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 2 and 3. See [here](#optimizer-offloading) for more details. | `False` | +| Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid for ZeRO stage 1, 2, 3. See [here](#optimizer-offloading) for more details. | `False` | ***stage3_max_live_parameters***: [integer] @@ -471,7 +481,7 @@ Enabling and configuring ZeRO memory optimizations | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Enable offloading of optimizer memory and computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 2. | `False` | +| Enable offloading of optimizer memory and computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid with stage 1 and 2. | `False` | ### Parameter offloading @@ -526,7 +536,7 @@ Note that if the value of "device" is not specified or not supported, an asserti | Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled. | 1e9 | ### Optimizer offloading -Enabling and configuring ZeRO optimization of offloading optimizer computation to CPU and state to CPU/NVMe. 
CPU offloading is available with ZeRO stage 2 or 3. NVMe offloading is available only with ZeRO stage 3. +Enabling and configuring ZeRO optimization of offloading optimizer computation to CPU and state to CPU/NVMe. CPU offloading is available with ZeRO stage 1, 2, 3. NVMe offloading is available only with ZeRO stage 3. Note that if the value of "device" is not specified or not supported, an assertion will be triggered. ```json "offload_optimizer": { @@ -642,8 +652,8 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s { "autotuning": { "enabled": false, - "results_dir": null, - "exps_dir": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", "overwrite": false, "metric": "throughput", "start_profile_step": 3, @@ -668,15 +678,15 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s **results_dir**: [string] -| Description | Default | -| -------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Path to the autotuning experiment results directory. If None, "autotuning_results" under the training script launching path is used. | `null` | +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | --------------------- | +| Path to the autotuning experiment results directory. The default appears in the working directory from which Deepspeed was launched. | "autotuning_results" | **exps_dir**: [string] -| Description | Default | -| ---------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Path to the auotuning experiment descriptions directory. If None, "autotuning_exps" under the train script launching path is used. 
| `null` | +| Description | Default | +| ---------------------------------------------------------------------------------------------------------------------------------------- | ------------------ | +| Path to the auotuning experiment descriptions directory. The default appears in the working directory from which Deepspeed was launched. | "autotuning_exps" | **overwrite**: [boolean] @@ -888,7 +898,132 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s } ``` +### Data Efficiency +DeepSpeed Data Efficiency Library includes two techniques: curriculum learning and random layerwise token dropping (random-LTD). Read more about how to use the DeepSpeed Data Efficiency Library in our [tutorial](/tutorials/data-efficiency/). + +```json +"data_efficiency": { + "enabled": true, + "seed": 1234, + "data_routing": { + "enabled": true, + "random_ltd":{ + "enabled": true, + "total_layer_num": 24, + "random_ltd_layer_num": 22, + "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], + "model_mask_name": "attention_mask", + "model_type": "decoder", + "hidden_state_order": "seq_batch_dim", + "random_ltd_schedule": { + "min_value": 128, + "max_value": 2048, + "schedule_type":"fixed_linear", + "schedule_config": { + "require_steps": 200000, + "seq_per_step": 16 + } + } + } + }, + "data_sampling": { + "enabled": true, + "num_epochs": 1, + "num_workers": 0, + "curriculum_learning": { + "enabled": true, + "data_cluster_path": "/path/to/data_clusters", + "curriculum_metrics": { + "vocabularyrarity": { + "index_to_sample_path": "/path/to/index_to_sample", + "index_to_metric_path": "/path/to/index_to_metric", + "difficulty_type": "percentile", + "clustering_type": "schedule_based", + "min_difficulty": 1, + "max_difficulty": 100, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": 110000, + "difficulty_step": 1, + "root_degree": 2 + } + } + } + } + } +} +``` + +**data_efficiency**: [dictionary] + +| 
Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable data efficiency or not. | `false` | +| **seed**: [integer] | Random seed for data sampling. | 1234 | +| **data_routing**: [dictionary] | Configs for data routing techniques. | N/A | +| **data_sampling**: [dictionary] | Configs for data sampling techniques. | N/A | + +**data_routing**: [dictionary] + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable data routing techniques or not. | `false` | +| **random_ltd**: [dictionary] | Configs for random-LTD technique. | N/A | + +**data_sampling**: [dictionary] + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable data sampling techniques or not. | `false` | +| **num_epochs**: [integer] | At most how many epochs of the original dataset will be iterated. | 1000 | +| **num_workers**: [integer] | Data loader number of workers. | 0 | +| **curriculum_learning**: [dictionary] | Configs for curriculum learning technique. | N/A | + +**random_ltd**: [dictionary] + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable random-LTD technique or not. | `false` | +| **total_layer_num**: [integer] | The number of layers (i.e., the depth) for the pretraining/fine-tuning model. | N/A | +| **random_ltd_layer_num**: [integer] | The number of layers that will be applied with random-LTD. | N/A | +| **random_ltd_layer_id**: [list] | The exact layer_id that will be applied with random-LTD. The length of this list must be the same as `random_ltd_layer_num`. | N/A | +| **model_mask_name**: [str] | The variable name of the attention_mask. Different libraries have different names, such as att_mask. For huggingface model, it’s named “attention_mask”. Users need to check the forward function in the original model files. 
If the attention mask input in the original model's forward function is not a keyword/named argument (e.g., attention_mask=None), user would need to change it to a keyword/named argument and provide that keyword as `model_mask_name`. | N/A | +| **model_type**: [str] | Users need to identify whether the model is `decoder` or `encoder`. Currently we only support these two. | N/A | +| **hidden_state_order**: [str] | Users need to know the input order of the hidden state tensor. Normally, it’s batch, sequence and then the hidden dimension, which is `batch_seq_dim`. Somethings, the order between batch and sequence will be switch like `seq_batch_dim`. Currently, we support these two. | N/A | +| **random_ltd_schedule**: [dictionary] | The schedule of the effective sequence length after token dropping. It's a linear function where random-LTD gradually drops less tokens and increases effective sequence length. | N/A | +|   **min_value**: [integer] | The initial effective sequence length (after token dropping) at step/iteration 0. | N/A | +|   **max_value**: [integer] | The max effective sequence length (usually the case without any token dropping). Usually this is set as baseline's seqlen. | N/A | +|   **schedule_type**: [str] | The sequence length follows a linear increasing function starting from `min_value` and reaching `max_value`. We currently only support this type. | N/A | +|   **schedule_config**: [dictionary] | Configs for the linear increasing function. | N/A | +|     **require_steps**: [integer] | How many iterations will be needed to reach max_value from min_value. | N/A | +|     **seq_per_step**: [integer] | At any time, the effective sequence length be multiple of this `seq_per_step`. Set this to multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. 
| N/A | + +**curriculum_learning**: [dictionary] + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable curriculum learing technique or not. | `false` | +| **data_cluster_path**: [str] | Path to directory where curriculum learning will store the indexes of data samples within the same difficulty ranges. | N/A | +| **curriculum_metrics**: [dictionary] | This dictionary includes all desired curriculum metrics and their configs. Each metric will be a separate sub-dictionary, where the key is the metric name and the values are configs below. | N/A | +|   **index_to_sample_path**: [str] | Path to the index_to_sample file generated during offline data analysis. Note that data analysis will generate two kinds of index_to_sample files: The metric_name_index_to_sample_percentile_merged file is a concatenated index for perf improvement, but it only works when you set difficulty_type=`percentile`. If you use difficulty_type=`value`, you need to change this to use the metric_name_index_to_sample file. | N/A | +|   **index_to_metric_path**: [str] | Path to the index_to_metric_path file generated during offline data analysis. | N/A | +|   **difficulty_type**: [str] | During training, how to increase the max accepted difficulty. Currently support `value` (increase by absolute value) and `percentile` (increase by difficulty percentile). | N/A | +|   **clustering_type**: [str] | Currently support `schedule_based` (cluster data based on the difficulty schedule (pacing function) below) and `single_cluster` (no clustering required and probably CL is achieved by data postprocessing, such as sequence length truncation). | N/A | +|   **min_difficulty**: [integer] | Starting difficulty at first step. When difficulty_type=`value` the `min_difficulty` is an absolute difficulty value. When difficulty_type=`percentile` the `min_difficulty` is a difficulty percentile value. | N/A | +|   **max_difficulty**: [integer] | Final max difficulty. 
When difficulty_type=`value` the `max_difficulty` is an absolute difficulty value. When difficulty_type=`percentile` the `max_difficulty` is a difficulty percentile value. | N/A | +|   **schedule_type**: [str] | The difficulty schedule (pacing function) that defines how the max accepted difficulty increases from `min_difficulty` to `max_difficulty` during training. Currently support `fixed_linear`, `fixed_root`, `fixed_discrete`, and `custom`. | N/A | +|   **schedule_config**: [dictionary] | Configs for the pacing function. When schedule_type=`custom` this dictionary is not necessary. Instead user needs to provide a callback function (via the `set_custom_curriculum_learning_schedule` API in deepspeed/runtime/engine.py) which will update the max accepted difficulty during training. Configs below are all belongs to `schedule_config`. | N/A | +|     **total_curriculum_step**: [integer] | How many steps the curriculum learning takes to go from min difficulty to max difficulty. Used by `fixed_linear` and `fixed_root` schedule. | N/A | +|     **difficulty_step**: [integer] | The max accepted difficulty level determined every step must be a multiple of this `difficulty_step`. This is used to ensure the use of NVIDIA Tensor Core acceleration (requires multiple of 8 (FP16) or 16 (INT8)). Used by `fixed_linear` and `fixed_root` schedule. | N/A | +|     **root_degree**: [integer] | The degree of the root function. Degree of 2 means square root and degree of 3 means cube root. Degree of 1 is equivalent to linear. Used by `fixed_root` schedule. | N/A | +|     **difficulty**: [list] | List of max accepted difficulty levels to be used during schedule. Used by `fixed_discrete` schedule. | N/A | +|     **max_step**: [list] | List of which step to change max accepted difficulty level. Used by `fixed_discrete` schedule. 
| N/A | + + ### Curriculum Learning + +**Note:** On 12/12/2022, we released [DeepSpeed Data Efficiency Library](/tutorials/data-efficiency/) which provides a more general curriculum learning support. This legacy curriculum learning feature below is still supported but we recommend to use the Data Efficiency Library. + ```json "curriculum_learning": { "enabled": true, @@ -964,13 +1099,15 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s | ---------------------------------------------------------------------------------------------------------------------------- | ------- | | List of which step to change difficulty level. One of the `schedule_config` when the `fixed_discrete` schedule_type is used. | N/A | -### Logging to Tensorboard +### Monitoring Module (TensorBoard, WandB, CSV) **Note:** Deepspeed logs to TensorBoard through PyTorch. Logging to TensorBoard requires that the `tensorboard` package is installed (read more in the [PyTorch documentation](https://pytorch.org/docs/1.8.0/tensorboard.html)). {: .notice--warning} +**Note:** Logging to WandB requires that the `wandb` package is installed (read more in the [WandB documentation](https://docs.wandb.ai/quickstart)). +{: .notice--warning} -Deepspeed can log training details into a [Tensorboard](https://www.tensorflow.org/tensorboard)-compatible file. Below is an overview of what deepspeed will log. +Deepspeed's Monitor module can log training details into a [Tensorboard](https://www.tensorflow.org/tensorboard)-compatible file, to [WandB](https://wandb.ai/site), or to simple CSV files. Below is an overview of what DeepSpeed will log automatically. 
| Field | Description |Conditions | | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | @@ -989,11 +1126,11 @@ Deepspeed can log training details into a [Tensorboard](https://www.tensorflow.o | Fields | Value |Default | | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | | enabled | Whether logging to [Tensorboard](https://www.tensorflow.org/tensorboard) is enabled. | `false` | -| job_name | Name for the current job. This will become a new directory inside `output_path` | `"DeepSpeedJobName"` | -| output_path | Path to where the Tensorboard logs will be written. | `~/tensorboard/` | +| output_path | Path to where the Tensorboard logs will be written. If None, the output path is set under the training script's launching path. | `null` | +| job_name | Name for the current job. This will become a new directory inside `output_path`. 
| `"DeepSpeedJobName"` | -Example of ** tensorboard** configuration: +Example of **tensorboard** configuration: ```json "tensorboard": { @@ -1002,3 +1139,517 @@ Example of ** tensorboard** configuration: "job_name": "train_bert" } ``` + +**wandb**: [dictionary] + +| Fields | Value |Default | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | +| enabled | Whether logging to [WandB](https://wandb.ai/site) is enabled. | `false` | +| group | Name for the WandB group. This can be used to group together runs. | `None` | +| team | Name for the WandB team. | `None` | +| project | Name for the WandB project. | `deepspeed` | + + +Example of **wandb** configuration: + +```json +"wandb": { + "enabled": true, + "group": "my_group", + "team": "my_team", + "project": "my_project" +} +``` + +**csv_monitor**: [dictionary] + +| Fields | Value |Default | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | +| enabled | Whether logging to local CSV files is enabled. | `false` | +| output_path | Path to where the csv files will be written. If None, the output path is set under the training script's launching path. | `null` | +| job_name | Name for the current job. 
This will become a new directory inside `output_path` | `"DeepSpeedJobName"` | + + +Example of **csv_monitor** configuration: + +```json +"csv_monitor": { + "enabled": true, + "output_path": "output/ds_logs/", + "job_name": "train_bert" +} +``` + +### Elastic Training Config (V0.1 and V0.2) + +```json + "elasticity": { + "enabled": true, + "max_train_batch_size": "seqlen", + "micro_batch_sizes": 8, + "min_gpus": 1024, + "max_gpus": "fixed_linear", + "min_time": "seqlen", + "version": 8, + "ignore_non_elastic_batch_info": 1024, + "num_gpus_per_node": "fixed_linear", + "model_parallel_size": MODEL_PARALLEL_SIZE + } +``` + +| Field | Description |Default| +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | +| `enabled` | Enables computation of global batch size in elastic training. | false | +| `max_train_batch_size` | Max acceptable batch size can be used in training. | 2000 | +| `micro_batch_sizes` | Acceptable micro batch sizes, same as train_micro_batch_size_per_gpu | [2,4,6] | +| `min_gpus` | Min number of GPUs to search over when computing highly composite batch size in v0.1 and v0.2. | 1 | +| `max_gpus` | Max number of GPUs to search over when computing highly composite batch size in v0.1 and v0.2. | 10000 | +| `min_time` |Minimum running time (minutes) before the scheduler will scale again (only used in v0.1). 0 implies it's unknown | 0 | +| `prefer_large_batch` | When finding a suitable batch size, attempt to find one that is closest to the max train batch size given. | true | +| `version` | Version of elastic logic to use. | 0.2 | +| `ignore_non_elastic_batch_info` | Ignore all batch info provided outside the elastic config. 
To reduce confusion, we require all batch related info to be given in elastic config only. | false | +| `num_gpus_per_node` | Number of GPUs per node. This information is used by v0.2 to support model-parallel training (only used by v0.2) | 1 | +| `model_parallel_size` | Tensor or model parallel size (only used by v0.2) | 1 | + + +### Communication Logging + + +DeepSpeed provides a flexible communication logging tool which can automatically detect and record communication operations launched via `deepspeed.comm`. NOTE: All logging communication calls are synchronized in order to provide accurate timing information. This may hamper performance if your model heavily uses asynchronous communication operations. + +Once the logs are populated, they can be summarized with `deepspeed.comm.log_summary()`. For more detail and example usage, see the [tutorial](/tutorials/comms-logging/) + + + + +**comms_logger**: [dictionary] + +| Fields | Value |Default | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | +| enabled | Whether communication logging is enabled. | `false` | +| verbose | Whether to immediately print every communication operation | `false` | +| prof_all | Whether to profile all operations. | `true` | +| debug | Appends the caller function to each communication operation's `log_name`. | `false` | +| prof_ops | A list of communication operations to log (only the specified ops will be profiled). 
| `[]` | + + +Example of recommended **comms_logger** configuration: + +```json +"comms_logger": { + "enabled": true, + "verbose": false, + "prof_all": true, + "debug": false +} +``` + +Example of **comms_logger** configuration for logging specific operations only: + +```json +"comms_logger": { + "enabled": true, + "verbose": false, + "prof_all": false, + "debug": false, + "prof_ops": ["all_reduce", "all_gather"] +} +``` +### Compression +**Note:** **Compression** has seven different components, including layer reduction, weight quantization, activation quantization, sparse pruning, row pruning, head pruning, and channel pruning. We explain them one by one with simple json examples. Read more about how to use the DeepSpeed Compression library in our [tutorial](/tutorials/model-compression/). + +#### Layer Reduction +**Note:** Layer reduction works much better when using knowledage distillation (learn more in our [tutorial](/tutorials/model-compression/)): + +```json +"compression_training": { + "layer_reduction": { + "enabled": true, + "keep_number_layer": 5, + "module_name_prefix": "bert.encoder.layer", + "teacher_layer": [ + 2, + 4, + 6, + 8, + 10 + ], + "other_module_name": [ + "bert.pooler", + "bert.embeddings", + "classifier" + ] + } + } +``` + +**layer_reduction**: [dictionary] + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable layer reduction or not. | `false` | +| **keep_number_layer**: [list] | The number of layer in the model to be kept. | N/A | +| **module_name_prefix**: [str] | The (uniform) name prefix of the model's modules of which the associated weight parameters are to be reinitialized. | N/A | +| **teacher_layer**: [list] | The layer of the weight parameters are to be reinitialized. The length of the list equals to 'keep_number_layer'. | N/A | +| **other_module_name**: [list] | The name of modules of which the associated weight parameters are to be reinitialized. 
It is a complementary or alternative to module_name_prefix. For instance, "other_module_name": ["bert.encoder.layer.2","bert.encoder.layer.4"] equals to "module_name_prefix":"bert.encoder.layer" and "teacher_layer": [2,4]. | N/A |
+
+#### Weight Quantization
+```json
+  "compression_training": {
+  "weight_quantization": {
+    "shared_parameters":{
+      "enabled": true,
+      "quantizer_kernel": false,
+      "schedule_offset": 0,
+      "quantize_groups": 1,
+      "quantize_verbose": false,
+      "quantization_type": "symmetric",
+      "rounding": "nearest",
+      "quantize_weight_in_forward": false,
+      "fp16_mixed_quantize":{
+        "enabled": false,
+        "quantize_change_ratio": 0.001
+      }
+    },
+    "different_groups":{
+      "wq1": {
+        "params": {
+            "start_bits": 8,
+            "target_bits": 8,
+            "quantization_period": 50
+        },
+        "modules": [
+          "attention.self",
+          "intermediate"
+        ]
+      },
+      "wq2": {
+        "params": {
+            "start_bits": 4,
+            "target_bits": 4,
+            "quantization_period": 50
+        },
+        "modules": [
+          "attention.output"
+        ]
+      }
+    }
+  }
+  }
+```
+
+**shared_parameters**: [dictionary]
+
+Shared parameters for all weight quantization groups.
+
+| Fields | Value | Default |
+| ----- | ----- | ----- |
+| **enabled**: [boolean] | Enable weight quantization or not. | `false` |
+| **quantizer_kernel**: [boolean] | Use DeepSpeed quantization kernel for >=4 bit quantization. This can only be enabled when using DeepSpeed FP16 optimizer. | `false` |
+| **schedule_offset**: [integer] | Enable weight quantization after scheduled steps (can be treated as warmup steps). | `0` |
+| **quantize_groups**: [integer] | Split the weight matrix into different number of groups, and each of them has its own scaling factor. | `1` |
+| **quantize_verbose**: [boolean] | Print the quantization related logs. | `false` |
+| **quantization_type**: [string] | Choose the quantization algorithm, symmetric or asymmetric. | `"symmetric"` |
+| **rounding**: [string] | Rounding algorithm associated with quantization, nearest or stochastic. 
| `"nearest"` |
+| **quantize_weight_in_forward**: [boolean] | Quantize weight in optimizer or forward step, must set to be true for FP32 optimizer training. | `false` |
+| **fp16_mixed_quantize**: [dictionary] | Using the value mixed by FP16 value and the quantized value. | N/A |
+|   **enabled**: [boolean] | Whether fp16 mixed quantization is enabled. | `false` |
+|   **quantize_change_ratio**: [float] | Initial quantize value ratio, will gradually increase to 1. | `0.001` |
+
+**different_groups**: [dictionary]
+
+Different quantization sets, this is used for different quantization parameters. In this example, we give two different sets. In practice, you can choose the number of sets based on your requirements.
+
+| Fields | Value | Default |
+| ----- | ----- | ----- |
+| **params**: [dictionary] | | |
+|   **start_bits**: [integer] | Quantization starting bits, will gradually reduce to target bits. | `8` |
+|   **target_bits**: [integer] | Quantization target bits, need to be <= start_bits. | `8` |
+|   **quantization_period**: [integer] | For every n steps, the quantization bits will be reduced by 1. | `1` |
+| **modules**: [list] | Scope of weight parameters associated to the params setting. | `"All Linear and CONV2D layers"` |
+
+#### Activation Quantization
+```json
+"compression_training": {
+  "activation_quantization": {
+    "shared_parameters":{
+      "enabled": true,
+      "quantization_type": "asymmetric",
+      "range_calibration": "dynamic",
+      "schedule_offset": 50
+    },
+    "different_groups":{
+      "aq1": {
+        "params": {
+            "bits": 8
+        },
+        "modules": [
+          "attention.output"
+        ]
+      }
+    }
+  }
+}
+```
+
+**shared_parameters**: [dictionary]
+
+Shared parameters for all activation quantization groups.
+
+| Fields | Value | Default |
+| ----- | ----- | ----- |
+| **enabled**: [boolean] | Enable activation quantization or not. | `false` |
+| **quantization_type**: [string] | Choose the quantization algorithm, symmetric or asymmetric. 
| `"symmetric"` | +| **range_calibration**: [string] | Using dynamic (per token or per image) or static (fixed min/max using momentum) for inference. | `"static"` | +| **schedule_offset**: [integer] | Enable activation quantization after scheduled steps (can be treated as warmup steps). | `0` | + +**different_groups**: [dictionary] + +Different quantization sets, this is used for different quantization parameters. In this example, we give one set. In practice, you can choose the number of sets based on your requirements. + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **params**: [dictionary] | | | +|   **bits**: [integer] | Number of bits used for activation target bits, need to be >= 4. | `8` | +| **modules**: [list] | Scope of weight parameters associated to the params setting. | `"All Linear and CONV2D layers"` | + +#### Sparse Pruning +```json +"compression_training": { + "sparse_pruning":{ + "shared_parameters":{ + "enabled": true, + "schedule_offset": 30, + "method": "l1" + }, + "different_groups":{ + "sp1": { + "params": { + "dense_ratio": 0.5 + }, + "modules": [ + "attention.self" + ] + } + } + } +} +``` + +**shared_parameters**: [dictionary] + +Shared parameters for all sparse pruning groups. + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable sparse pruning or not. | `false` | +| **schedule_offset**: [integer] | Enable sparse pruning after scheduled steps (can be treated as warmup steps). | `0` | +| **method**: [string] | Choose different pruning methods, l1 (static, magnitude based) or topk (dynamic, learnable). | `"l1"` | + +**different_groups**: [dictionary] + +Different pruning sets, this is used for different pruning parameters. In this example, we give one set. In practice, you can choose the number of sets based on your requirements. 
+ +| Fields | Value | Default | +| ----- | ----- | ----- | +| **params**: [dictionary] | | | +|   **dense_ratio**: [float] | The percentage of weights to keep after pruning. | `0.5` | +| **modules**: [list] | Scope of weight parameters associated to the params setting. | `"All Linear and CONV2D layers"` | + +#### Row Pruning +**Note:** **Row Pruning** is a feature designed for two back-to-back linear layers (e.g., Feed Forward Network in Transformers). As such, we suggested use row pruning for the first linear layer (i.e., the `intermediate.dense` layer for BERT). Reducing the row dimension of this matrix can help reducing the column of the follow-up matrix (i.e., `layer.\\w+.output.dense` layer for BERT). It should also work for other linear layers as well. +```json +"compression_training": { + "row_pruning":{ + "shared_parameters":{ + "enabled": true, + "schedule_offset": 20, + "method": "topk" + }, + "different_groups":{ + "rp1": { + "params": { + "dense_ratio": 0.5 + }, + "modules": [ + "intermediate.dense" + ], + "related_modules":[ + ["layer.\\w+.output.dense"] + ] + } + } + } +} +``` + +**shared_parameters**: [dictionary] + +Shared parameters for all row pruning groups. + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable row pruning or not. | `false` | +| **schedule_offset**: [integer] | Enable row pruning after scheduled steps (can be treated as warmup steps). | `0` | +| **method**: [string] | Choose different pruning methods, l1 (static, magnitude based) or topk (dynamic, learnable). | `"l1"` | + +**different_groups**: [dictionary] + +Different pruning sets, this is used for different pruning parameters. In this example, we give one set. In practice, you can choose the number of sets based on your requirements. + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **params**: [dictionary] | | | +|   **dense_ratio**: [float] | The percentage of weights to keep after pruning. 
| `0.5` | +| **modules**: [list] | Scope of weight parameters associated to the params setting. | `"All Linear and CONV2D layers"` | +| **related_modules**: [list[list]] | Related module to the row pruned module, which can be performed column pruning. | `None` | + +#### Head Pruning +**Note:** **Head Pruning** is a feature designed for two attention layers (e.g., Multi Head Attention in Transformers). For now, it can only be applied to output matrix of the Transformer (i.e., `attention.output.dense` in BERT). Pruning the output matrix can lead to the pruning of Query/Key/Value matrix as well. +```json +"compression_training": { + "head_pruning":{ + "shared_parameters":{ + "enabled": true, + "schedule_offset": 10, + "method": "topk", + "num_heads": 12 + }, + "different_groups":{ + "rp1": { + "params": { + "dense_ratio": 0.5 + }, + "modules": [ + "attention.output.dense" + ], + "related_modules":[ + ["self.query", "self.key", "self.value"] + ] + } + } + } +} + +``` + +**shared_parameters**: [dictionary] + +Shared parameters for all head pruning groups. + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable head pruning or not. | `false` | +| **schedule_offset**: [integer] | Enable head pruning after scheduled steps (can be treated as warmup steps). | `0` | +| **method**: [string] | Choose different pruning methods. For now, we only support topk (dynamic, learnable). | `"topk"` | +| **num_heads**: [int] | Number of heads (must be provided by user). | N/A | + +**different_groups**: [dictionary] + +Different pruning sets, this is used for different pruning parameters. In this example, we give one set. In practice, you can choose the number of sets based on your requirements. + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **params**: [dictionary] | | | +|   **dense_ratio**: [float] | The percentage of weights to keep after pruning. 
| `0.5` | +| **modules**: [list] | Scope of weight parameters associated to the params setting. | `"All Linear and CONV2D layers"` | +| **related_modules**: [list[list]] | Related module (Usually Q/K/V) to the head pruned module (i.e., the output matrix). For now, this feature only works for BERT. | `None` | + +#### Channel Pruning +**Note:** **Channel Pruning** is a feature designed for two back-to-back CONV2d layers (e.g., residual connection in ResNet). As such, we suggested use channel pruning for the first CONV2d layer. Reducing the number of output channels of this layer can help reducing the number of input channels the follow-up layer. It should also work for other CONV2d layers as well. +```json +"compression_training": { +"channel_pruning":{ + "shared_parameters":{ + "enabled": true, + "schedule_offset": 0, + "method": "topk" + }, + "different_groups":{ + "cp1": { + "params": { + "dense_ratio": 0.5 + }, + "modules": [ + "layer....conv1" + ], + "related_modules": [ + ["layer....conv2", "layer....bn1"] + ] + } + } + } +} +``` + +**shared_parameters**: [dictionary] + +Shared parameters for all channel pruning groups. + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **enabled**: [boolean] | Enable channel pruning or not. | `false` | +| **schedule_offset**: [integer] | Enable channel pruning after scheduled steps (can be treated as warmup steps). | `0` | +| **method**: [string] | Choose different pruning methods, l1 (static, magnitude based) or topk (dynamic, learnable). | `"l1"` | + +**different_groups**: [dictionary] + +Different pruning sets, this is used for different pruning parameters. In this example, we give one set. In practice, you can choose the number of sets based on your requirements. + +| Fields | Value | Default | +| ----- | ----- | ----- | +| **params**: [dictionary] | | | +|   **dense_ratio**: [float] | The percentage of weights to keep after pruning. 
| `0.5` |
+| **modules**: [list] | Scope of weight parameters associated to the params setting. | `"All CONV2D layers"` |
+| **related_modules**: [list[list]] | Related module to the channel pruned module. | `None` |
+
+### Checkpoint options
+
+```json
+"checkpoint": {
+    "tag_validation": "Warn",
+    "load_universal": false,
+    "use_node_local_storage": false,
+    "parallel_write":{
+        "pipeline_stage": false
+    }
+}
+```
+
+**tag_validation**: ["Ignore"|"Warn"|"Fail"]
+
+| Description | Default |
+| -------------------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Enables level of checking to ensure checkpoint tags are consistent across all ranks. Useful when restoring with different world sizes. | "Warn" |
+
+**load_universal**: [boolean]
+
+| Description | Default |
+| -------------------------------------- | ------- |
+| Load the latest checkpoint for all. | `false` |
+
+**use_node_local_storage**: [boolean]
+
+| Description | Default |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
+| If `true` DeepSpeed will store model parameter states and checkpoint states based on local rank allowing checkpoints to be loaded without access to a shared filesystem. | `false` |
+
+**pipeline_stage**: [boolean]
+
+| Description | Default |
+| ------------------------------------------------------------- | ------- |
+| Use pipeline stages to parallelize the writing of checkpoints.| `false` |
+
+### Data Type options
+
+```json
+"data_types": {
+    "grad_accum_dtype": ["fp32"|"fp16"|"bf16"]
+}
+```
+
+**grad_accum_dtype**: ["fp32"|"fp16"|"bf16"]
+
+| Description | Default |
+| --------------------------------------------------------------------------------------------------------------| ------- |
+| Specifies the data type in which to do gradient accumulation. 
If None the default is to match the model type. | None | diff --git a/docs/_pages/inference.md b/docs/_pages/inference.md new file mode 100755 index 0000000000000000000000000000000000000000..d63604e1f022b4686b81f4988b0e1e7b6f75a6a0 --- /dev/null +++ b/docs/_pages/inference.md @@ -0,0 +1,13 @@ +--- +title: "Inference Overview and Features" +layout: single +permalink: /inference/ +toc: true +toc_label: "Contents" +--- + +DeepSpeed-Inference introduces several features to efficiently serve transformer-based PyTorch models. It supports model parallelism (MP) to fit large models that would otherwise not fit in GPU memory. Even for smaller models, MP can be used to reduce latency for inference. To further reduce latency and cost, we introduce inference-customized kernels. Finally, we propose a novel approach to quantize models, called MoQ, to both shrink the model and reduce the inference cost at production. For more details on the inference related optimizations in DeepSpeed, please refer to our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/). + +DeepSpeed provides a seamless inference mode for compatible transformer based models trained using DeepSpeed, Megatron, and HuggingFace, meaning that we don’t require any change on the modeling side such as exporting the model or creating a different checkpoint from your trained checkpoints. To run inference on multi-GPU for compatible models, provide the model parallelism degree and the checkpoint information or the model which is already loaded from a checkpoint, and DeepSpeed will do the rest. It will automatically partition the model as necessary, inject compatible high performance kernels into your model and manage the inter-gpu communication. For list of compatible models please see [here](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py). 
+ +To get started with DeepSpeed-Inference, please checkout our [tutorial](https://www.deepspeed.ai/tutorials/inference-tutorial/). diff --git a/docs/_pages/training.md b/docs/_pages/training.md new file mode 100644 index 0000000000000000000000000000000000000000..466800a3d9870a53fb593ecb233790c4fdf52df6 --- /dev/null +++ b/docs/_pages/training.md @@ -0,0 +1,580 @@ +--- +title: "Training Overview and Features" +layout: single +permalink: /training/ +toc: true +toc_label: "Contents" +--- + +# Overview +Training advanced deep learning models is challenging. Beyond model design, +model scientists also need to set up the state-of-the-art training techniques +such as distributed training, mixed precision, gradient accumulation, and +checkpointing. Yet still, scientists may not achieve the desired system +performance and convergence rate. Large model sizes are even more challenging: +a large model easily runs out of memory with pure data parallelism and it is +difficult to use model parallelism. DeepSpeed addresses these challenges to +accelerate model development *and* training. + +## Distributed, Effective, and Efficient Training with Ease +The DeepSpeed API is a lightweight wrapper on [PyTorch](https://pytorch.org/). This +means that you can use everything you love in PyTorch and without learning a new +platform. In addition, DeepSpeed manages all of the boilerplate state-of-the-art +training techniques, such as distributed training, mixed precision, gradient +accumulation, and checkpoints so that you can focus on your model development. Most +importantly, you can leverage the distinctive efficiency and effectiveness benefit of +DeepSpeed to boost speed and scale with just a few lines of code changes to your PyTorch +models. + +## Speed +DeepSpeed achieves high performance and fast convergence through a combination of +efficiency optimizations on compute/communication/memory/IO and effectiveness +optimizations on advanced hyperparameter tuning and optimizers. 
For example: + +* DeepSpeed trains BERT-large to parity in 44 + mins using 1024 V100 GPUs (64 DGX-2 boxes) and in 2.4 hours using 256 GPUs + (16 DGX-2 boxes). + + **BERT-large Training Times** + + | Devices | Source | Training Time | + | -------------- | --------- | ---------------------:| + | 1024 V100 GPUs | DeepSpeed | **44** min| + | 256 V100 GPUs | DeepSpeed | **2.4** hr| + | 64 V100 GPUs | DeepSpeed | **8.68** hr| + | 16 V100 GPUs | DeepSpeed | **33.22** hr| + + *BERT codes and tutorials will be available soon.* + +* DeepSpeed trains GPT2 (1.5 billion parameters) 3.75x faster than state-of-art, NVIDIA + Megatron on Azure GPUs. + + *Read more*: [GPT tutorial](/tutorials/megatron/) + + + +## Memory efficiency +DeepSpeed provides memory-efficient data parallelism and enables training models without +model parallelism. For example, DeepSpeed can train models with up to 13 billion parameters on +a single GPU. In comparison, existing frameworks (e.g., +PyTorch's Distributed Data Parallel) run out of memory with 1.4 billion parameter models. + +DeepSpeed reduces the training memory footprint through a novel solution called Zero +Redundancy Optimizer (ZeRO). Unlike basic data parallelism where memory states are +replicated across data-parallel processes, ZeRO partitions model states and gradients to save +significant memory. Furthermore, it also reduces activation memory and fragmented memory. +The current implementation (ZeRO-2) reduces memory by up to +8x relative to the state-of-art. You can read more about ZeRO in our [paper](https://arxiv.org/abs/1910.02054), and +in our blog posts related to +[ZeRO-1](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) and [ZeRO-2](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/). 
+ +With this impressive memory reduction, early adopters of DeepSpeed have already +produced a language model (LM) with over 17B parameters called + +Turing-NLG, +establishing a new SOTA in the LM category. + +For model scientists with limited GPU resources, ZeRO-Offload leverages both CPU and GPU memory for training large models. Using a machine with **a single GPU**, our users can run **models of up to 13 billion parameters** without running out of memory, 10x bigger than the existing approaches, while obtaining competitive throughput. This feature democratizes multi-billion-parameter model training and opens the window for many deep learning practitioners to explore bigger and better models. + +## Scalability +DeepSpeed supports efficient data parallelism, model parallelism, pipeline parallelism and their +combinations, which we call 3D parallelism. +* 3D parallelism of DeepSpeed provides system support to run models with trillions of parameters, read more in our [press-release]({{ site.press_release_v3 }}) and [tutorial](/tutorials/pipeline). +* DeepSpeed can run large models more efficiently, up to 10x + faster for models with + various sizes spanning 1.5B to hundred billion. More specifically, the data parallelism powered by ZeRO + is complementary and can be combined with different types of model parallelism. It allows + DeepSpeed to fit models using lower degree of model parallelism and higher batch size, offering + significant performance gains compared to using model parallelism alone. + + *Read more*: [ZeRO paper](https://arxiv.org/abs/1910.02054), + and [GPT tutorial](/tutorials/megatron). + +![DeepSpeed Speedup](/assets/images/deepspeed-speedup.png) +

+The figure depicts system throughput improvements of DeepSpeed (combining ZeRO-powered data parallelism with model parallelism of NVIDIA Megatron-LM) over using Megatron-LM alone. +

+ +## Communication efficiency +Pipeline parallelism of DeepSpeed reduce communication volume during distributed training, which allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. +![Low-bandwidth GPT-2 Performance](/assets/images/pp-lowbw-gpt2.png) + +1-bit Adam, 0/1 Adam and 1-bit LAMB reduce communication volume by up to 26x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. [1-bit Adam blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html), [1-bit Adam tutorial](https://www.deepspeed.ai/tutorials/onebit-adam/), [0/1 Adam tutorial](https://www.deepspeed.ai/tutorials/zero-one-adam/), [1-bit LAMB tutorial](https://www.deepspeed.ai/tutorials/onebit-lamb/). + +## Data efficiency +DeepSpeed Data Efficiency Library provides efficient data sampling via curriculum learning and efficient data routing via random layerwise token dropping. The composed solution enables up to 2x data and 2x time saving during GPT-3/BERT pretraining and GPT/ViT finetuning, or further improve model quality under the same data/time. See more in [the tutorial](/tutorials/data-efficiency). + +## Supporting long sequence length +DeepSpeed offers sparse attention kernels—an instrumental technology to support long sequences of model inputs, whether for text, image, or sound. Compared with the classic dense Transformers, it powers **an order-of-magnitude longer input sequence** and obtains up to 6x faster execution with comparable accuracy. It also outperforms state-of-the-art sparse implementations with 1.5–3x faster execution. Furthermore, our sparse kernels support efficient execution of flexible sparse format and empower users to innovate on their custom sparse structures. [Read more here](https://www.deepspeed.ai/2020/09/08/sparse-attention.html). 
+ + +## Fast convergence for effectiveness +DeepSpeed supports advanced hyperparameter tuning and large batch size +optimizers such as [LAMB](https://arxiv.org/abs/1904.00962). These improve the +effectiveness of model training and reduce the number of samples required to +convergence to desired accuracy. + +*Read more*: [Tuning tutorial](/tutorials/one-cycle). + + +## Good Usability +Only a few lines of code changes are needed to enable a PyTorch model to use DeepSpeed and ZeRO. Compared to current model parallelism libraries, DeepSpeed does not require a code redesign or model refactoring. It also does not put limitations on model dimensions (such as number of attention heads, hidden sizes, and others), batch size, or any other training parameters. For models of up to 13 billion parameters, you can use ZeRO-powered data parallelism conveniently without requiring model parallelism, while in contrast, standard data parallelism will run out of memory for models with more than 1.4 billion parameters. In addition, DeepSpeed conveniently supports flexible combination of ZeRO-powered data parallelism with custom model parallelisms, such as tensor slicing of NVIDIA's Megatron-LM. + + +## Features + +Below we provide a brief feature list, see our detailed [feature overview](https://www.deepspeed.ai/features/) for descriptions and usage. 
+ +* [Distributed Training with Mixed Precision](https://www.deepspeed.ai/features/#distributed-training-with-mixed-precision) + * 16-bit mixed precision + * Single-GPU/Multi-GPU/Multi-Node +* [Model Parallelism](https://www.deepspeed.ai/features/#model-parallelism) + * Support for Custom Model Parallelism + * Integration with Megatron-LM +* [Pipeline Parallelism](https://www.deepspeed.ai/tutorials/pipeline/) + * 3D Parallelism +* [The Zero Redundancy Optimizer](https://www.deepspeed.ai/tutorials/zero/) + * Optimizer State and Gradient Partitioning + * Activation Partitioning + * Constant Buffer Optimization + * Contiguous Memory Optimization +* [ZeRO-Offload](https://www.deepspeed.ai/tutorials/zero-offload/) + * Leverage both CPU/GPU memory for model training + * Support 10B model training on a single GPU +* [Ultra-fast dense transformer kernels](https://www.deepspeed.ai/2020/05/18/bert-record.html) +* [Sparse attention](https://www.deepspeed.ai/2020/09/08/sparse-attention-news.html) + * Memory- and compute-efficient sparse kernels + * Support 10x long sequences than dense + * Flexible support to different sparse structures +* [1-bit Adam](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html), [0/1 Adam](https://www.deepspeed.ai/tutorials/zero-one-adam/) and [1-bit LAMB](https://www.deepspeed.ai/tutorials/onebit-lamb/) + * Custom communication collective + * Up to 26x communication volume saving +* [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations) + * Smart Gradient Accumulation + * Communication/Computation Overlap +* [Training Features](https://www.deepspeed.ai/features/#training-features) + * Simplified training API + * Gradient Clipping + * Automatic loss scaling with mixed precision +* [Training Optimizers](https://www.deepspeed.ai/features/#training-optimizers) + * Fused Adam optimizer and arbitrary `torch.optim.Optimizer` + * Memory bandwidth optimized FP16 Optimizer + 
* Large Batch Training with LAMB Optimizer + * Memory efficient Training with ZeRO Optimizer + * CPU-Adam +* [Training Agnostic Checkpointing](https://www.deepspeed.ai/features/#training-agnostic-checkpointing) +* [Advanced Parameter Search](https://www.deepspeed.ai/features/#advanced-parameter-search) + * Learning Rate Range Test + * 1Cycle Learning Rate Schedule +* [Simplified Data Loader](https://www.deepspeed.ai/features/#simplified-data-loader) +* [Data Efficiency](https://www.deepspeed.ai/tutorials/data-efficiency/) + * Efficient data sampling via curriculum learning and efficient data routing via random layerwise token dropping + * Up to 2x data and 2x time saving during GPT-3/BERT pretraining and GPT/ViT finetuning + * Or further improve model quality under the same data/time +* [Curriculum Learning](https://www.deepspeed.ai/tutorials/curriculum-learning/) + * A curriculum learning-based data pipeline that presents easier or simpler examples earlier during training + * Stable and 3.3x faster GPT-2 pre-training with 8x/4x larger batch size/learning rate while maintaining token-wise convergence speed + * Complementary to many other DeepSpeed features + * Note that the Data Efficiency Library above provides more general curriculum learning support. This legacy curriculum learning feature is still supported but we recommend to use the Data Efficiency Library. 
+* [Progressive Layer Dropping](https://www.deepspeed.ai/2020/10/28/progressive-layer-dropping-news.html) + * Efficient and robust compressed training + * Up to 2.5x convergence speedup for pre-training +* [Performance Analysis and Debugging](https://www.deepspeed.ai/features/#performance-analysis-and-debugging) +* [Mixture of Experts (MoE)](https://www.deepspeed.ai/tutorials/mixture-of-experts/) + + +--- +title: "Feature Overview" +layout: single +permalink: /features/ +toc: true +toc_label: "Contents" +--- + +## Distributed Training with Mixed Precision + +### Mixed Precision Training +Enable 16-bit (FP16) training by in the `deepspeed_config` JSON. +```json +"fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 +} +``` + +### Single-GPU, Multi-GPU, and Multi-Node Training +Easily switch between single-GPU, single-node multi-GPU, or multi-node multi-GPU +execution by specifying resources with a hostfile. +```bash +deepspeed --hostfile= \ + \ + --deepspeed --deepspeed_config ds_config.json +``` +The script `` will execute on the resources specified in +[``](/getting-started/#resource-configuration-multi-node). + +## Pipeline Parallelism +DeepSpeed provides [pipeline parallelism](/tutorials/pipeline/) for memory- +and communication- efficient training. DeepSpeed supports a hybrid +combination of data, model, and pipeline parallelism and has scaled to over +[one trillion parameters using 3D parallelism]({{ site.press_release_v3 }}). +Pipeline parallelism can also improve communication efficiency and has +accelerated training by up to 7x on low-bandwidth clusters. + + +## Model Parallelism +### Support for Custom Model Parallelism +DeepSpeed supports all forms of model parallelism including tensor slicing +based approaches such as the +[Megatron-LM](https://github.com/NVIDIA/Megatron-LM). 
It does so by only +requiring the model parallelism framework to provide a *model parallelism +unit* (`mpu`) that implements a few bookkeeping functionalities: + +```python +mpu.get_model_parallel_rank() +mpu.get_model_parallel_group() +mpu.get_model_parallel_world_size() + +mpu.get_data_parallel_rank() +mpu.get_data_parallel_group() +mpu.get_data_parallel_world_size() +``` + +### Integration with Megatron-LM +DeepSpeed is fully compatible with [Megatron](https://github.com/NVIDIA/Megatron-LM). +Please see the [Megatron-LM tutorial](/tutorials/megatron/) for details. + + + + +## The Zero Redundancy Optimizer +The Zero Redundancy Optimizer ([ZeRO](https://arxiv.org/abs/1910.02054)) is at +the heart of DeepSpeed and enables large model training at a scale that is +simply not possible with model parallelism alone. When enabled, ZeRO allows +training models with over 13 billion parameters without any model parallelism, +and up to 200 billion parameter models with model parallelism on current +generation hardware. + +For more details see the [ZeRO paper](https://arxiv.org/abs/1910.02054), [GPT +tutorial](/tutorials/megatron/) on integration with +DeepSpeed. + +### Optimizer State and Gradient Partitioning +Optimizer State and Gradient Partitioning in ZeRO reduces the memory consumption of the +model states (optimizer states, gradients and parameters) by 8x compared to standard +data parallelism by partitioning these states across data parallel process instead of +replicating them. + +### Activation Partitioning +Activation Partitioning is a memory optimization in ZeRO that can reduce the memory +consumed by activations during model parallel training (MP). In MP certain +activations maybe required by all MP processes, resulting in a replication of +activations across MP GPUs. Activation Partitioning stores these activations in a +partitioned state once they are used for computation in the forward propagation. 
These +activations are allgathered right before they are needed again during the backward propagation. +By storing activations in a partitioned state, ZeRO in DeepSpeed can reduce the activation +memory footprint proportional to the MP degree. + +### Constant Buffer Optimization (CBO) +CBO enables high network and memory throughput while restricting memory usage to a +constant size. For memory- and network-bound operations such as normalization or +allreduce collectives, the performance depends on the size of the operand. Simply fusing +all operands into a single large operand can enable great throughput at the expense of +unnecessary memory overhead. CBO in DeepSpeed fuses smaller operands into approximately a +pre-defined sized buffer large enough to achieve great performance without the +unnecessary memory overhead. + +### Contiguous Memory Optimization (CMO) +CMO reduces memory fragmentation during training, preventing out of memory errors +due to lack of contiguous memory. Memory fragmentation is a result of interleaving between +short lived and long lived memory objects. During the forward propagation activation +checkpoints are long lived but the activations that recomputed are short lived. Similarly, +during the backward computation, the activation gradients are short lived while the parameter +gradients are long lived. CMO transfers activation checkpoints and parameter gradients +to contiguous buffers preventing memory fragmentation. + +## ZeRO-Offload + +ZeRO-Offload pushes the boundary of the maximum model size that can be trained efficiently using minimal GPU resources, by exploiting computational and memory resources on both GPUs and their host CPUs. It allows training up to 13-billion-parameter models on a single NVIDIA V100 GPU, 10x larger than the state-of-the-art, while retaining high training throughput of over 30 teraflops per GPU. 
+
+For more details see the [ZeRO-Offload release blog](https://www.microsoft.com/en-us/research/?p=689370&secret=iSlooB), and the [tutorial](/tutorials/zero-offload/) on integration with DeepSpeed.
+
+## Additional Memory and Bandwidth Optimizations
+
+### Smart Gradient Accumulation
+Gradient accumulation allows running larger batch size with limited memory by breaking an
+effective batch into several sequential micro-batches, and averaging the parameter
+gradients across these micro-batches. Furthermore, instead of averaging the gradients of
+each micro-batch across all GPUs, the gradients are averaged locally during each step of
+the sequence, and a single `allreduce` is done at the end of the sequence to produce the
+averaged gradients for the effective batch across all GPUs. This strategy significantly
+reduces the communication involved over the approach of averaging globally for each
+micro-batch, especially when the number of micro-batches per effective batch is large.
+
+### Communication Overlapping
+During back propagation, DeepSpeed can overlap the communication required for averaging
+parameter gradients that have already been computed with the ongoing gradient computation.
+This computation-communication overlap allows DeepSpeed to achieve higher throughput even
+at modest batch sizes.
+
+## Training Features
+
+### Simplified training API
+The DeepSpeed core API consists of just a handful of methods:
+* initialization: `initialize`
+* training: `backward` and `step`
+* argument parsing: `add_config_arguments`
+* checkpointing: `load_checkpoint` and `save_checkpoint`
+
+DeepSpeed supports most of the features described in this document, via the use of these APIs,
+along with a `deepspeed_config` JSON file for enabling and disabling the features.
+Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details.
+ +### Activation Checkpointing API + +DeepSpeed's Activation Checkpointing API supports activation checkpoint partitioning, +cpu checkpointing, and contiguous memory optimizations, while also allowing layerwise +profiling. Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details. + + +### Gradient Clipping +```json +{ + "gradient_clipping": 1.0 +} +``` +DeepSpeed handles gradient clipping under the hood based on the max gradient norm +specified by the user. +Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details. + +### Automatic loss scaling with mixed precision +DeepSpeed internally handles loss scaling for mixed precision training. The parameters +for loss scaling can be specified in the `deepspeed_config` JSON file. +Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details. + +## Training Optimizers + +### 1-bit Adam, 0/1 Adam and 1-bit LAMB optimizers with up to 26x less communication + +DeepSpeed has three communication-efficient optimizers called 1-bit Adam, 0/1 Adam and 1-bit LAMB. +They offer the same convergence as Adam/LAMB, incur up to 26x less communication that enables +up to 6.6x higher throughput for BERT-Large pretraining and up to 2.7x higher throughput +for SQuAD fine-tuning on bandwidth-limited clusters. For more details on usage and performance, +please refer to the [1-bit Adam tutorial](https://www.deepspeed.ai/tutorials/onebit-adam), +[1-bit Adam blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.md), +[0/1 Adam tutorial](https://www.deepspeed.ai/tutorials/zero-one-adam) +and [1-bit LAMB tutorial](https://www.deepspeed.ai/tutorials/onebit-lamb/). For technical details, +please refer to the [1-bit Adam paper](https://arxiv.org/abs/2102.02888), [0/1 Adam paper](https://arxiv.org/abs/2202.06009) and +[1-bit LAMB paper](https://arxiv.org/abs/2104.06069). 
+ +### Fused Adam optimizer and arbitrary torch.optim.Optimizer +With DeepSpeed, the user can choose to use a high performance implementation of ADAM from +NVIDIA, or any training optimizer that extends torch's `torch.optim.Optimizer` class. + +### CPU-Adam: High-Performance vectorized implementation of Adam +We introduce an efficient implementation of Adam optimizer on CPU that improves the parameter-update +performance by nearly an order of magnitude. We use the AVX SIMD instructions on Intel-x86 architecture +for the CPU-Adam implementation. We support both AVX-512 and AVX-2 instruction sets. DeepSpeed uses +AVX-2 by default which can be switched to AVX-512 by setting the build flag, `DS_BUILD_AVX512` to 1 when +installing DeepSpeed. Using AVX-512, we observe 5.1x to 6.5x speedups considering the model-size between +1 to 10 billion parameters with respect to torch-adam. + +### Memory bandwidth optimized FP16 Optimizer +Mixed precision training is handled by the DeepSpeed FP16 Optimizer. This optimizer not +only handles FP16 training but is also highly efficient. The performance of weight update +is primarily dominated by the memory bandwidth, and the achieved memory bandwidth is +dependent on the size of the input operands. The FP16 Optimizer is designed to maximize +the achievable memory bandwidth by merging all the parameters of the model into a single +large buffer, and applying the weight updates in a single kernel, allowing it to achieve +high memory bandwidth. + +### Large Batch Training with LAMB Optimizer + +DeepSpeed makes it easy to train with large batch sizes by enabling the LAMB Optimizer. +For more details on LAMB, see the [LAMB paper](https://arxiv.org/pdf/1904.00962.pdf). + +### Memory-Efficient Training with ZeRO Optimizer +DeepSpeed can train models with up to 13 billion parameters without model parallelism, and +models with up to 200 billion parameters with 16-way model parallelism. 
This leap in +model size is possible through the memory efficiency achieved via the ZeRO Optimizer. For +more details see [ZeRO paper](https://arxiv.org/abs/1910.02054) . + + + +## Training Agnostic Checkpointing +DeepSpeed can simplify checkpointing for you regardless of whether you are using data +parallel training, model parallel training, mixed-precision training, a mix of these +three, or using the zero optimizer to enable larger model sizes. +Please see the [Getting Started](/getting-started/) guide +and the [core API doc](https://deepspeed.readthedocs.io/) for more details. + +## Advanced parameter search +DeepSpeed supports multiple Learning Rate Schedules to enable faster convergence for +large batch scaling. + +### Learning Rate Range Test +Please refer to the [Learning Rate Range Test](/tutorials/lrrt/) tutorial. + +### 1Cycle Learning Rate Schedule +Please refer to the [1Cycle Learning Rate Schedule](/tutorials/1Cycle/) tutorial. + + +## Simplified Data Loader +DeepSpeed abstracts away data parallelism and model parallelism from the user when it +comes to data loading. Users simply provide a PyTorch dataset, and DeepSpeed data loader +can automatically handle batch creation appropriately. + +## Data Efficiency +Please refer to the [Data Efficiency](/tutorials/data-efficiency/) tutorial. + +## Curriculum Learning +Please refer to the [Curriculum Learning](/tutorials/curriculum-learning/) tutorial. Note that the Data Efficiency Library above provides more general curriculum learning support. This legacy curriculum learning feature is still supported but we recommend to use the Data Efficiency Library. + +## Performance Analysis and Debugging + +DeepSpeed provides a set of tools for performance analysis and debugging. + +### Wall Clock Breakdown + +DeepSpeed provides a detailed breakdown of the time spent +in different parts of the training. +This can be enabled by setting the following in the `deepspeed_config` file. 
+
+```json
+{
+  "wall_clock_breakdown": true
+}
+
+```
+
+### Timing Activation Checkpoint Functions
+
+When activation checkpointing is enabled, profiling the forward and backward time of each checkpoint function can be enabled in the `deepspeed_config` file.
+
+```json
+{
+  "activation_checkpointing": {
+    "profile": true
+  }
+}
+
+```
+
+### Flops Profiler
+
+The DeepSpeed flops profiler measures the time, flops and parameters of a PyTorch model and shows which modules or layers are the bottleneck. When used with the DeepSpeed runtime, the flops profiler can be configured in the `deepspeed_config` file as follows:
+
+```json
+{
+  "flops_profiler": {
+    "enabled": true,
+    "profile_step": 1,
+    "module_depth": -1,
+    "top_modules": 3,
+    "detailed": true
+  }
+}
+
+```
+The flops profiler can also be used as a standalone package. Please refer to the [Flops Profiler](/tutorials/flops-profiler) tutorial for more details.
+
+
+### Autotuning
+
+The DeepSpeed Autotuner uses model information, system information, and heuristics to efficiently tune Zero stage, micro batch size, and other Zero configurations. Using the autotuning feature requires no code change from DeepSpeed users. While `"autotuning": {"enabled": true}` is the minimum required to enable autotuning, there are other parameters users can define to configure the autotuning process. Below shows major parameters and their default values in the autotuning configuration. Please refer to the [Autotuning](/tutorials/autotuning) tutorial for more details.
+
+```json
+{
+  "autotuning": {
+    "enabled": true,
+    "results_dir": null,
+    "exps_dir": null,
+    "overwrite": false,
+    "metric": "throughput",
+    "num_nodes": null,
+    "num_gpus": null,
+    "start_profile_step": 3,
+    "end_profile_step": 5,
+    "fast": true,
+    "num_tuning_micro_batch_sizes": 3,
+    "tuner_type": "model_based",
+    "tuner_early_stopping": 5,
+    "tuner_num_trials": 50,
+    "arg_mappings": null
+  }
+}
+
+```
+
+### Monitor
+
+The DeepSpeed Monitor logs live training metrics to one or more monitoring backends, including PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), or simply to CSV files. The Monitor can be configured with one or more backends in the `deepspeed_config` file as follows:
+
+```json
+{
+  "tensorboard": {
+    "enabled": true,
+    "output_path": "output/ds_logs/",
+    "job_name": "train_bert"
+  },
+  "wandb": {
+    "enabled": true,
+    "team": "my_team",
+    "group": "my_group",
+    "project": "my_project"
+  },
+  "csv_monitor": {
+    "enabled": true,
+    "output_path": "output/ds_logs/",
+    "job_name": "train_bert"
+  }
+}
+
+```
+
+The Monitor can also log custom metrics defined in client code. Please refer to the [Monitor](/tutorials/monitor) tutorial for more details.
+
+### Communication Logging
+
+DeepSpeed provides logging of all communication operations launched within `deepspeed.comm`. The communication logger can be configured in the `deepspeed_config` file as follows:
+
+```json
+{
+  "comms_logger": {
+    "enabled": true,
+    "verbose": false,
+    "prof_all": true,
+    "debug": false
+  }
+}
+
+```
+
+Client code can then print a summary with a call to `deepspeed.comm.log_summary()`. For more details and example usage, see the [Communication Logging](/tutorials/comms-logging) tutorial.
+ +## Sparse Attention +DeepSpeed offers sparse attention to support long sequences. Please refer to the [Sparse Attention](/tutorials/sparse-attention/) tutorial. + +```bash +--deepspeed_sparse_attention +``` + +```json +"sparse_attention": { + "mode": "fixed", + "block": 16, + "different_layout_per_head": true, + "num_local_blocks": 4, + "num_global_blocks": 1, + "attention": "bidirectional", + "horizontal_global_attention": false, + "num_different_global_patterns": 4 +} +``` + +## Mixture of Experts (MoE) +To learn more about training Mixture of Experts (MoE) models with DeepSpeed, see our [tutorial](https://www.deepspeed.ai/tutorials/mixture-of-experts/) for more details. diff --git a/docs/_posts/2020-09-09-ZeRO-Offload.md b/docs/_posts/2020-09-09-ZeRO-Offload.md old mode 100644 new mode 100755 diff --git a/docs/_posts/2020-10-28-progressive-layer-dropping-news.md b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md old mode 100644 new mode 100755 diff --git a/docs/_posts/2022-03-21-amd-support.md b/docs/_posts/2022-03-21-amd-support.md index 0c9ca3fb68d0977c52b2ad11ba2e53a1020d2b91..ba8917bc386a80ba1fec6a83036d36fe87babb9e 100644 --- a/docs/_posts/2022-03-21-amd-support.md +++ b/docs/_posts/2022-03-21-amd-support.md @@ -3,5 +3,5 @@ title: "Supporting efficient large model training on AMD Instinct GPUs with Deep excerpt: "" link: https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/ date: 2022-03-21 00:00:00 -tags: training inference ZeRO +tags: training ZeRO --- diff --git a/docs/_posts/2022-07-26-deepspeed-azure.md b/docs/_posts/2022-07-26-deepspeed-azure.md new file mode 100644 index 0000000000000000000000000000000000000000..128cbf4a416ecba88ac9269f2ce4dd073883b009 --- /dev/null +++ b/docs/_posts/2022-07-26-deepspeed-azure.md @@ -0,0 +1,135 @@ +--- +title: "Azure empowers easy-to-use, high-performance, and hyperscale model training using DeepSpeed" +excerpt: "" +date: 
2022-07-26 00:09:00 +tags: training azure +--- + +## Introduction + +Large-scale transformer-based deep learning models trained on large amounts of data have shown great results in recent years in several cognitive tasks and are behind new products and features that augment human capabilities. These models have grown several orders of magnitude in size during the last five years. Starting from a few million parameters of the original transformer model all the way to the latest 530 billion-parameter Megatron-Turing model as shown in *Figure 1*. There is a growing need for customers to train and fine tune large models at an unprecedented scale. + +![Large Models](/assets/images/large-model-graph.png){: .align-center} + +*Figure 1: Landscape of large models and hardware capabilities* + +To train these models, users needed to set up and maintain a complex distributed training infrastructure that usually required several manual and error-prone steps. These lead to a subpar experience both in terms of usability and performance. We recently [announced](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/) how we are making great strides to simplify this and enable easy-to-use and high-performance training at 1K+ GPU scale on Azure. + +In this extended post, we share the details of how DeepSpeed users can train trillion-parameter models with a new easy-to-use, streamlined, scalable, and high-performance distributed training experience on Azure. We also share details of the experimental setup, model configurations, additional performance trends, and guide our users on how to run these experiments in their own environments. + +## Making distributed training faster and easier on Azure using DeepSpeed + +We compare the existing manual and error-prone workflow with our proposed easy-to-use workflow for DeepSpeed on Azure in *Figure 2*. 
Customers can now use easy-to-use [training pipelines](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples) to launch training jobs at scale. The new workflow reduces the number of steps from 11 to just 1 if users rely on the recommended [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/) [recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml). + + +![Workflow](/assets/images/old-vs-new-azure.png){: .align-center} + +*Figure 2: An easy-to-use and streamlined distributed training experience with DeepSpeed on Azure* + +For users who have custom environments built using Azure VMs or [Azure VMSS](https://docs.microsoft.com/en-us/azure/virtual-machine-scale-sets/overview), only two steps are needed: + +- 1) Run the cluster setup script (to be released in the next few weeks) +- 2) Use the Azure VMSS [recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azure) to launch training. + +## Key Performance Benefits +We already shared a summary of our key performance results in the Azure [announcement](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/). We enable the capability to train 2x larger model sizes (2 trillion vs. 1 trillion parameters), scale to 2x more GPUs (1024 vs. 512), and offer up to 1.8x higher compute throughput/GPU (150 TFLOPs vs. 81 TFLOPs) compared to other [cloud providers](https://medium.com/pytorch/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff). + +DeepSpeed on Azure offers near-linear scalability both in terms of **increase in model size** as well as **increase in number of GPUs**. 
As shown in *Figure 3a*, together with the DeepSpeed [ZeRO-3](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/), its novel CPU offloading capabilities, and a high-performance Azure stack powered by InfiniBand interconnects and A100 GPUs, we were able to maintain an efficient throughput/GPU (>157 TFLOPs) in a near-linear fashion as the model size increases from 175 billion parameters to 2 trillion parameters. On the other hand, for a given model size, e.g., 175B, we achieve near-linear scaling as we increase the number of GPUs from 128 all the way to 1024 as shown in *Figure 3b*. The key takeaway is that Azure and DeepSpeed together are breaking the GPU memory wall and enabling our customers to easily and efficiently train trillion-parameter models at scale. + +![Perf-overview](/assets/images/perf-overview.png){: .align-center} + +*Figure 3: (a) Near-perfect throughput/GPU as we increase the model size from 175 billion to 2 trillion parameters (BS/GPU=8). (b) Near-perfect performance scaling with the increase in number of GPU devices for the 175B model (BS/GPU=16). The sequence length is 1024 for both cases.* + +## Experimental Setup +We share the details of our experimental setup and some of the best practices we followed. The users can either directly use them to reproduce our results or modify them to fit their own setup in terms of model scale as well as the scale of Azure hardware being provisioned. + +### Hardware (Azure instances) + +We used [NDm A100 v4-series](https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series) instances in our experiments. Each instance includes two socket AMD EPYC 7V12 64-Core CPUs, 1.7TB main memory and eight A100 80GB GPUs. The system has a balanced PCIe topology connecting 4 GPU devices to each CPU socket. 
Each GPU within the VM is provided with its own dedicated, topology-agnostic 200 Gb/s NVIDIA Mellanox HDR InfiniBand connection providing an accelerated 200 Gbps high speed fabric. The DeepSpeed library exploits offload capabilities where the activation and optimizer states are allocated in the main memory. Hence, 1.7TB memory capacity per node helps us to scale to large model sizes. + +### Training setup using AzureML +Users can directly use the AzureML studio and use our published [recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml) to run experiments without any additional setup. This is the easiest and recommended way of running experiments on Azure. + +### Training setup using Azure VMSS + +Existing VMSS customers and others who have custom Azure VM based environments can follow the setup as follows. The scripts to make these steps easy will be released in the coming weeks. +A cluster is created using Azure Virtual Machine Scale Sets (VMSS) to provision the desired number of compute nodes running the new Azure HPAI VM image specialized for extreme-scale deep learning applications using the software stack listed in *Table 1*. + +| Name | Description (Version) | +| ------------------------------: | :----------------: | +| PyTorch | 1.10.2 (installed from source) | +| DeepSpeed | 0.6.2 (installed from source) | +| Megatron-LM | [https://github.com/microsoft/Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) | +| Apex | 0.1 | +| NCCL | 2.12.10 | +| CUDNN | 8.2.4.15 | +| CUDA | 11.4 | +| CUDA Driver | R470.82 | +| VM Image | Ubuntu-HPC 20.04 Image | + +*Table 1: Detailed version information of the software packages in the Azure HPC VM image* + +Users can create a VMSS with up to 600 VM instances enabling up to 4,800 A100 GPUs. 
In addition to the VMSS for the compute nodes, we provision a distinct login node using an inexpensive D4s v4 (or similar) instance with 4-core Intel VCPU, running the same image, for compiling, launching, and monitoring jobs. The login node, compute nodes, and a shared storage filesystem are grouped within an Azure Virtual Network (vnet) allowing VMs to connect to each other over SSH and to shared NFS volume shown in *Figure 4*. + +![VMSS-overview](/assets/images/vmss-setup.png){: .align-center} + +*Figure 4: Organization of our VMSS-based experimental setup* + +## Performance Evaluation on Various Model Configurations + +We ran our experiments with four different model sizes – 175B, 530B, 1T, and 2T – using the configurations shown in *Table 2*. + +| Model Size | 175B | 530B | 1T | 2T | +| :---------: | ---: | ---: | ---: | ---: | +| Number of layers | 96 | 105 | 128 | 160 | +| Hidden Dimension | 12,288 | 20,480 | 25,600 | 32,768 | +| Attention Heads | 96 | 128 | 160 | 128 | + +*Table 2: Model configuration* + +For each of these configurations, we report peak throughput of the system using TFLOPs/GPU as the main performance metric. To calculate TFLOPs, we use the formula used by the Megatron paper as shown below. + +```FLOPs/GPU = 96 * B * s * l * h2 * (1 + s/6h + V/(16*l*h))``` + +B is batch size, s is sequence length, l is the number of layers, h is hidden size, and V is vocabulary size. + +### Scaling the 175B and 530B models +*Figures 5a* and *5b* show the results of 175B model with sequence length 512 and 1024, respectively. We only scale to 512 GPUs for seq-length 512 as adding more GPUs shows similar performance. On the other hand, with sequence length 1024, we saw linear performance increase to 1024 GPUs. Overall, the peak throughput of **204.49 TFLOPs/GPU** was achieved on 256 GPUs with a micro batch size of 32 and sequence length of 512. 
+ +![175b-overview](/assets/images/175b-trend.png){: .align-center} + +*Figure 5: Performance characteristics of 175B model on 512 and 1K GPUs respectively. The colored columns signify different micro batch sizes.* + +Next, we report the 530B model scaling. Previous results on the 530B MT-NLG model using DeepSpeed and Megatron-LM on 280 DGX A100 servers on the Selene supercomputer showed the peak throughput of 126 TFLOPS/GPU. However, we were able to surpass that throughput and achieved up to **171.37 TFLOPs/GPU** on 128 NDm A100 v4-series A100 systems (i.e., 1024 GPUs) as shown in *Figure 6*. + +The benefit of this 530B model is its simpler parallelization configuration as there is no tensor/pipeline parallelism. With ZeRO powered data parallelism, there are fewer heuristics required to optimally configure the distributed model. In addition, the consistent steady state performance of more than 140 TFLOPs/GPU for micro batch sizes >1 demonstrates a robust software and hardware platform. + +![530b-overview](/assets/images/530b-trend.png){: .align-center} + +*Figure 6: Throughput achieved with a 530B parameter model on 512 and 1024 GPUs for micro-batch sizes per GPU of 1, 2, 4, and 8, with sequence length 1,024.* + +### Scaling the 1T and 2T models + +The 1T parameter model contains 128 layers with 160 attention heads. Training such an extreme-scale model is not an easy task. *Figure 7* shows the throughput achieved for each of the model configurations we explored on 512 and 1024 GPUs. Peak throughput achieved was **165.36 TFLOPs/GPU** for micro batch size of 8 across 1024 GPUs and the model reached steady state performance within the first 3-4 iterations. 
+ +![1t-overview](/assets/images/1t-trend.png){: .align-center} + +*Figure 7: Performance characteristics of 1T parameter model on 512 and 1024 GPUs with 1, 2, 4, and 8 micro batch sizes, with sequence length 1,024.*{: .align-center} + +The 2T parameter model consists of 160 layers, 32k hidden dimension, and 128 attention heads. Given the large size of the model and the significant time required on 1024 GPUs, we limited our benchmark runs for the 2T model to a batch size of 8 per GPU with a sequence length of 1024. We were able to achieve 157 TFLOPs/GPU on 1,024 GPUs. + +## How to run training experiments on Azure? + +We recognize that DeepSpeed users are diverse and have different environments. In this tutorial, our focus is on making things simpler for users who plan to run large model training experiments on Azure. + +> The easiest way to do model training on Azure is via the Azure ML recipes. The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml). Users simply need to setup their Azure ML workspace following the [guide](https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up) and submit experiment using the aml_submit.py file. + +Some users have customized environments built on top of Azure VMs and VMSS based clusters. To simplify training on such setups, we are working on an easy-to-use cluster setup script that will be published in the next few weeks. If you already have a cluster setup running, you can use the [azure recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azure) for the 175B and the 1T model. The recipes can easily be modified to train other model configurations. + +## Acknowledgement + +This blog post was written by the DeepSpeed team in collaboration with the AzureML and the AzureHPC team. We would like to acknowledge several individuals who made this work possible: +- AzureHPC team: Russell J. 
Hewett, Kushal Datta, Prabhat Ram, Jithin Jose, and Nidhi Chappell +- AzureML team: Vijay Aski, Razvan Tanase, Miseon Park, Savita Mittal, Ravi Shankar Kolli, Prasanth Pulavarthi, and Daniel Moth +- DeepSpeed team: Ammar Ahmad Awan, Jeff Rasley, Samyam Rajbhandari, Martin Cai, and Yuxiong He +- CTO office: Gopi Kumar and Luis Vargas diff --git a/docs/_posts/2022-09-10-zero-inference.md b/docs/_posts/2022-09-10-zero-inference.md new file mode 100644 index 0000000000000000000000000000000000000000..dd718b9f883906291326a18a18ad7ec54529f6dd --- /dev/null +++ b/docs/_posts/2022-09-10-zero-inference.md @@ -0,0 +1,122 @@ +--- +title: "ZeRO-Inference: Democratizing massive model inference" +excerpt: "" +date: 2022-09-10 00:09:00 +tags: inference ZeRO +--- + +## Introduction +The current trends in artificial intelligence (AI) domains such as image, speech, and natural language, demonstrate that model quality can be improved by increasing model size. In natural language processing, for example, the state-of-the-art (SOTA) model has grown from 300 million parameters (Bert-Large) to 500 billion parameters (Megatron-Turing-530B) in less than four years. However, this dramatic growth in model sizes has significantly increased the GPU cost to train, finetune or inference these models, making them unaffordable to most users. To democratize access to AI innovations, large organizations, such as Hugging Face (BigScience), Meta, and Yandex have recently publicly released pre-trained massive models. Unfortunately, even these publicly available models are not broadly usable because many users cannot afford the dozens of GPUs required to fit them for inference computation. For example, half-precision inference computation on Megatron-Turing-530B (SOTA model for natural language) requires at least 40 A100-40GB GPUs, which is unaffordable to many students, model scientists, hobbyists, and small businesses that could benefit from using these powerful models. 
And so, a real concern is that if the dramatic increase in model sizes continues, then a growing fraction of users could be excluded from the benefits of these AI innovations. + +DeepSpeed, a part of Microsoft’s AI at Scale Initiative, has developed the ZeRO-Inference technology to address these obstacles to AI democratization. ZeRO-Inference comes from the family of ZeRO technologies, which are a collection of powerful memory and parallelism optimizations for efficient large scale model training and inference on modern GPU clusters. DeepSpeed had previously developed ZeRO-Infinity, a technology that leverages heterogeneous memory (GPU, CPU, and NVMe) to efficiently scale model training to extreme levels. ZeRO-Inference adapts and optimizes ZeRO-Infinity techniques for model inference on GPUs by hosting the model weights in CPU or NVMe memory, thus hosting no (**zero**) weights in GPU. This approach is inspired by the observation that the aggregate capacity of CPU and NVMe memories in most commodity computing devices (e.g., laptops, desktops, workstations, etc.) is on the order of terabytes and sufficient to host the largest known models for inference computation. By leveraging this non-GPU memory, ZeRO-Inference enables inference computation of massive models (with hundreds of billions of parameters) on as few as a single GPU, thereby making massive model inference accessible to almost everyone. Moreover, by dramatically reducing GPU memory requirements with CPU or NVMe memory which are significantly cheaper, it significantly reduces the cost of massive model inference, offering an affordable inference path to SOTA models. + +## How ZeRO-Inference works +The massive computational requirements of large model inference means that accelerators like GPUs are required for efficient execution. 
Therefore, an important design decision for large model inference on limited GPU budget is how to apportion GPU memory among model weights, inference inputs, and intermediate results.
+
+### Offload all model weights
+ZeRO-Inference pins the entire model weights in CPU or NVMe (whichever is sufficient to accommodate the full model) and streams the weights layer-by-layer into the GPU for inference computation. After computing a layer, the outputs are retained in GPU memory as inputs for the next layer, while memory consumed by the layer weights is released for use by the next layer. Thus, model inference time is composed of the time to compute the layers on GPU, and the time to fetch the layers over PCIe. For large model inference, this approach provides scaling and efficiency benefits, as explained below.
+
+ZeRO-Inference offers scaling benefits in two ways. First, by keeping just one (or a few) model layers in GPU memory at any time, ZeRO-Inference significantly reduces the amount of GPU memory required to inference massive models. For current SOTA models which have about a hundred layers (e.g., 96 and 105 layers in GPT3-175B and Megatron-Turing-530B respectively), ZeRO-Inference reduces the GPU memory requirements by up to two orders of magnitude. For example, with ZeRO-Inference, GPU memory consumption of Megatron-Turing-530B for half-precision inference drops from 1TB to 10GB. Second, by fitting the model into CPU or NVMe memory which are orders of magnitude cheaper than GPU memory, ZeRO-Inference makes scaling to future SOTA models (e.g., with trillions or tens-of-trillions of parameters) more affordable compared to approaches that fit the entire model into GPU memory.
+
+ZeRO-Inference delivers efficient computation for throughput-oriented inference applications despite the latency of fetching model weights from CPU or NVMe over PCIe interconnect.
The primary reason for this is that by limiting GPU memory usage of the model to one or a few layers of weights, ZeRO-Inference can use the majority of GPU memory to support a large amount of input tokens in the form of long sequences or large batch sizes. A large model layer requires a significant amount of computation, especially when processing inputs with many input tokens. For example, one GPT3-175B layer requires about 7 TFlops to process an input of batch size 1 and sequence length of 2048. Therefore, for inference scenarios with long sequence length and large batch sizes, the computation time dominates the latency of fetching model weights, which ultimately improves efficiency. In summary, ZeRO-Inference's strategy to utilize GPU memory to support large number of input tokens results in high performance inference for large models. + +### Optimizations +To further improve system efficiency, ZeRO-Inference leverages two additional optimizations to reduce the latency of fetching layer weights from CPU or NVMe memory into GPU memory. + +The first optimization involves overlapping the fetch of a layer with the computation of an earlier layer, a.k.a., layer prefetching. Layer prefetching allows ZeRO-Inference to hide portions of the transfer latency of the prefetched layers. This is especially useful when computation time is not large enough or cannot be sufficiently increased (e.g., with larger batch size) to dominate the latency of fetching layer weights. + +The second optimization, which is applicable for inference on multiple GPUs, involves parallelizing the fetch of each layer across multiple GPUs by using each GPU to fetch only a portion of the layer. Employing the aggregate PCIe links of the GPUs in this manner essentially increases the transfer bandwidth linearly, thus reducing the latency. With this approach, fetching layers into GPU memory occurs in two phases. First, each GPU independently fetches a partition of the layer over PCIe into its memory. 
At this point, only a partition of the layer will be resident on each GPU. Next, each GPU assembles the full layer for computation by fetching the missing layer pieces from other GPUs over the high-bandwidth GPU-GPU interconnect (e.g., NVLink, xGMI, etc.). Since GPU-GPU interconnect bandwidth is typically over an order of magnitude higher than PCIe bandwidth, efficient multi-GPU or multi-node communication primitives, such as NCCL or RCCL all-gather, can be used to efficiently assemble the full layer on all GPUs with negligible latency compared to the PCIe latency. + +### Alternative approach: Host some model weights in GPU memory +An alternative approach to ZeRO-Inference is to pin as many of the model weights as possible into GPU memory and fetch the remainder (from CPU or NVMe) when needed for computation. A benefit of this approach is avoidance of the latency of fetching weights that are already pinned in GPU memory. However, this approach has two downsides: (i) the latency savings for hundred-billion parameter models are negligible since only a small fraction of the weights can fit in GPU memory, and (ii) even when a decent portion of the model weights can fit (e.g., > 50% for ~10B models), the remaining GPU memory can only fit small batch sizes which hurts inference throughput. We later show evaluation results to demonstrate that this approach is sub-optimal. + + +## Model Scaling on 1 GPU +ZeRO-Inference enables significant model scaling for inference on a single GPU compared to a baseline that hosts the model in GPU memory (i.e., HBM). As an example, we consider half-precision model inference using a single NVIDIA Tesla V100 GPU in a NVIDIA DGX2 system. While the V100 GPU has 32GB of memory, the system is equipped with 1.5TB of CPU DRAM and 30TB of NVMe storage. The maximum model size supported for inference computation on GPU depends on the memory in which the model is hosted. 
*Figure 1* below shows the achievable model scales in this system for GPU inference with ZeRO-Inference. In comparison, the baseline cannot support models larger than 16 billion parameters for GPU inference[^model_scale]. In contrast, ZeRO-Inference has the flexibility to host the model in a different memory (DRAM or NVMe) than HBM. This flexibility allows ZeRO-Inference to support much larger models than baseline. For example, by hosting a model on NVMe memory, ZeRO-Inference can support models with up to 15 trillion parameters for GPU inference, which is almost a thousand times larger compared to baseline. A practical takeaway from *Figure 1* is that ZeRO-Inference enables single GPU inference computation of current SOTA models, since they are smaller than 15 trillion parameters. + +![Model-Scaling](/assets/images/zero_inference_model_scale.png){: .align-center} + +[^model_scale]: A 16 billion parameter model won’t fit in V100-32GB for half-precision inference since no memory will be left for inputs and intermediate results. + +## Token Generation Performance +An important inference workload is token generation based on an input prompt. In this workload the model is provided a text sequence as input prompt, and based on this prompt, the model generates output text of configurable length. We use this workload to demonstrate the performance of ZeRO-Inference. This workload consists of two phases: (1) the prompt processing phase where the model processes the input prompt, and (2) the generation phase where the model generates the output tokens. + +ZeRO-Inference is targeted for throughput-oriented inference applications, and so the performance metric that we use for this workload is the number of tokens generated per second in the generation phase. We use the Hugging Face token generation pipeline in our experiments to measure the performance of using a greedy search algorithm to generate ten output tokens given an input prompt of four tokens. 
The generation pipeline in our experiments uses KV-caching optimization to improve performance by caching generated tokens to avoid re-computation. We consider the performance impact of three aspects of ZeRO-Inference design choices and optimizations: (1) full offloading model weights as opposed to partial offloading, (2) prefetching layer weights ahead of use, and (3) using multiple GPUs to parallelize layer fetching over PCIe. Additionally, we measure the performance impact of varying the number of output tokens. + +### Models +For our experiments, we use the three publicly available massive language models listed in *Table 1*. We configure these models for half-precision inference computations. ZeRO-Inference is required to inference these models on a single V100-32GB since they are bigger than GPU memory. + +![Public-models](/assets/images/zero_inference_models.png){: .align-center} + +### Full Offload vs. Partial Offload of model weights +A key design choice in ZeRO-Inference is to offload all the weights of models larger than GPU memory rather than host a subset of the weights in GPU memory. Our intuition for this approach is that for throughput-oriented inference applications, the larger batch sizes enabled by full offload yield better performance than partial offload. In *Table 2*, we present results for OPT-30B token generation on a single V100-32GB that compare fully offloading the model weights versus hosting a portion (i.e., 10 and 12 billion parameters[^partial_offload]) in GPU memory. The results show that full offload delivers the best performance for both CPU memory (43 tokens per second) and NVMe memory (30 tokens per second). With both CPU and NVMe memory, full offload is over 1.3x and 2.4x faster than partial offload of 10 and 12 billion parameters respectively. The performance advantage of full offload comes from the larger batch sizes compared to the partial offload options. 
**Thus when a model does not fit in GPU, using GPU memory to increase batch size rather than to partially fit the model leads to faster token generation.** + +![Full-offload](/assets/images/zero_inference_full_offload.png){: .align-center} + +[^partial_offload]: Pinning more parameters in GPU memory resulted in out of memory errors for small batch sizes. + +### Prefetching layer weights +ZeRO-Inference fetches layers ahead of use, overlapping with current layer computation, to hide layer transfer latency. We measure the impact of prefetching on token generation performance on a single V100-32GB and summarize the results in *Table 3*. We observe that prefetching did not improve CPU offload. This is because the relatively short sequences in token generation (i.e., less than 50 tokens) resulted in layer computation time that is insufficient to hide a significant portion of layer fetch time from CPU. In contrast, prefetching improves NVMe offloading performance by 1.13x, 1.14x and 1.21x for OPT-30B, OPT-175B, and BLOOM-176B respectively. This is because transferring weights from NVMe through CPU memory allows prefetching to overlap transfers from CPU to GPU memory with transfers from NVMe to CPU boosting the effective transfer bandwidth. + +![Prefetch-Layer](/assets/images/zero_inference_prefetch.png){: .align-center} + +### Parallelizing layer fetching on multiple GPUs +ZeRO-Inference leverages the four PCIe interconnects between GPUs and CPU memory to parallelize layer fetching for faster inference computations on multiple GPUs. In *Table 4*, we report the throughput improvements for token generation on two and four GPUs compared to a single GPU[^multi_gpu_pcie] . These results were collected with layer prefetching enabled. The reported throughput numbers are per GPU showing that token generation becomes faster on each GPU as the aggregated PCIe links reduce the layer fetch latencies. **The improved per GPU throughput translates to super-linear scaling performance**. 
Additionally, these results suggest improved bandwidths of future PCIe generations could help to improve ZeRO-Inference performance. + +![Multi-GPU](/assets/images/zero_inference_multi_gpu.png){: .align-center} + +[^multi_gpu_pcie]: For multiple GPU runs, we select GPUs with independent PCIe interconnects to CPU memory. + + +### Impact of generation output length +We measure the performance impact of the number of output tokens since the memory overhead of KV-caching optimization increases with longer output tokens and could limit batch size. First, we consider the impact of token lengths 10, 20, 50, and 100 on batch size that can fit one V100-32GB GPU. The results in *Table 5* show a 2X reduction in batch size for a 5X increase in token count (compared to baseline count of 10). + +![Token-count-batch-size](/assets/images/zero_inference_token_count_batch_size.png){: .align-center} + +Next, we measure the impact on generation throughput using four V100-32GB GPUs. The results are presented in Table 6 for CPU offload, and Table 7 for NVMe-Offload. We observe an impact that is consistent across models and offload memory, which is that increasing the number of output tokens reduces throughput proportionally to batch size reduction. These results also demonstrate the importance of large batch sizes to the performance of ZeRO-Inference. + +![Token-count-cpu-throughput](/assets/images/zero_inference_token_count_cpu_throughput.png){: .align-center} + +![Token-count-nvme-throughput](/assets/images/zero_inference_token_count_nvme_throughput.png){: .align-center} + +## Using ZeRO-Inference +We briefly discuss how users can determine when ZeRO-Inference is suitable for their application and how to enable ZeRO-Inference in DeepSpeed. + +### When to use ZeRO-Inference +ZeRO-Inference is designed for inference applications that require GPU acceleration but lack sufficient GPU memory to host the model. 
Also, ZeRO-Inference is optimized for inference applications that are **throughput-oriented** and allow **large batch sizes**. Alternative techniques, such as [Accelerate](https://github.com/huggingface/accelerate), [DeepSpeed-Inference](https://www.deepspeed.ai/inference/), and [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) that fit the entire model into GPU memory, possibly using multiple GPUs, are more suitable for inference applications that are latency sensitive or have small batch sizes. + +### How to use ZeRO-Inference +ZeRO-Inference is available in the DeepSpeed library versions >= 0.6.6. Integrating ZeRO-Inference into token generation pipelines, such as [Hugging Face generate](https://huggingface.co/docs/transformers/main_classes/text_generation), requires updating the DeepSpeed configuration to set [ZeRO optimization](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training) to stage 3 and [parameter offloading](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) to CPU or NVMe. + +Below is a configuration snippet for enabling ZeRO-Inference with offloading to CPU memory. +```json + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "cpu", + ... + }, + ... + } +``` + +Below is a configuration snippet for offloading to a NVMe device mounted on "/local_nvme". +```json + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "nvme", + "nvme_path": "/local_nvme", + ... + }, + ... + } +``` + + +## Conclusion +Recent advances in AI technology have primarily come from extreme scaling of model sizes. However, extreme model scaling has also made the hardware cost of training and inferencing prohibitive for all but the largest organizations, severely restricting access to AI innovations. To help democratize AI, we developed ZeRO-Inference, a technology that enables inference computations of massive models on as few as a single GPU. 
ZeRO-Inference reduces the GPU cost of SOTA model inference by hosting the model on CPU or NVMe memory and streaming the model layers into GPU memory for inference computation. ZeRO-Inference complements the democratization efforts of large organizations that publicly release pre-trained SOTA models by ensuring that inference computation of these models is affordable for most users (e.g., students, hobbyists, model scientists, etc.). + + +## Acknowledgement +The DeepSpeed team would like to acknowledge Stas Bekman for previewing this blog and providing valuable feedback. diff --git a/docs/_posts/2022-10-11-mii.md b/docs/_posts/2022-10-11-mii.md new file mode 100644 index 0000000000000000000000000000000000000000..8a39731759656687fafd7aaf76b9464bccca757d --- /dev/null +++ b/docs/_posts/2022-10-11-mii.md @@ -0,0 +1,216 @@ +--- +title: "DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference" +excerpt: "" +date: 2022-10-11 00:09:00 +tags: inference +--- + +[ ![Text Generation Models](/assets/images/mii/hero.png) ](/assets/images/mii/hero.png){: .align-center} + +The Deep Learning (DL) open-source community has seen tremendous growth in the last few months. Incredibly powerful text generation models such as the Bloom 176B, or image generation models such as Stable Diffusion are now available to anyone with access to a handful or even a single GPU through platforms such as Hugging Face. While open-sourcing has democratized access to AI capabilities, their application is still restricted by two critical factors: 1) inference latency and 2) cost. + +There has been significant progress in system optimizations for DL model inference that can drastically reduce both latency and cost, but those are not easily accessible. The main reason for this limited accessibility is that the DL model inference landscape is diverse with models varying in size, architecture, system performance characteristics, hardware requirements, etc. 
Identifying the appropriate set of system optimizations applicable to a given model and applying them correctly is often beyond the scope of most data scientists, making low latency and low-cost inference mostly inaccessible. + +[DeepSpeed Model Implementations for Inference (MII)](https://github.com/microsoft/DeepSpeed-MII) is a new open-source python library from DeepSpeed, aimed towards making low-latency, low-cost inference of powerful models not only feasible but also easily accessible. + +* MII offers access to highly optimized implementations of **thousands of widely used DL models.** +* MII supported models achieve significantly lower latency and cost compared to their original implementation. + + MII reduces the **latency of Big-Science Bloom 176B model by 5.7x**, while reducing the **cost by over 40x as shown in *Figures 2 (left) and 8***. + + MII reduces the latency and cost of deploying **Stable Diffusion by 1.9x as shown in *Figure 2 (right)***. +* To enable low latency/cost inference, MII leverages an extensive set of optimizations from DeepSpeed-Inference such as *deepfusion* for transformers, automated *tensor-slicing* for multi-GPU inference, on-the-fly quantization with *ZeroQuant*, and several others (see below for more details). +* With state-of-the-art performance, MII supports low-cost deployment of these models both on-premises and on Azure via AML with just a **few lines of codes**. + +# How does MII work? + +[ ![Text Generation Models](/assets/images/mii/mii-arch.png) ](/assets/images/mii/mii-arch.png) + +*Figure 1: MII Architecture, showing how MII automatically optimizes OSS models using DS-Inference before deploying them on-premises using GRPC, or on Microsoft Azure using AML Inference.* + + +Under-the-hood MII is powered by [DeepSpeed-Inference](https://arxiv.org/abs/2207.00032). 
Based on the model type, model size, batch size, and available hardware resources, MII automatically applies the appropriate set of system optimizations from DeepSpeed-Inference to minimize latency and maximize throughput. It does so by using one of many pre-specified model injection policies, that allows MII and DeepSpeed-Inference to identify the underlying PyTorch model architecture and replace it with an optimized implementation (see *Figure 1*). In doing so, MII makes the expansive set of optimizations in DeepSpeed-Inference automatically available for thousands of popular models that it supports. + +# Supported Models and Tasks + +MII supports a growing list of tasks such as text generation, question-answering, text classification, etc, across thousands of transformer models available through multiple open-sourced model repositories such as Hugging Face, FairSeq, EleutherAI, etc. It supports dense models based on BERT, RoBERTa, GPT, OPT, and BLOOM architectures ranging from a few hundred million parameters in size to hundreds of billions of parameters in size. At the same time, it supports recent image generation models such as Stable Diffusion. + +See the MII GitHub repo for an up-to-date list of [models and tasks supported by MII](https://github.com/microsoft/deepspeed-mii#supported-models-and-tasks). + +# Inference Optimizations with MII + +Here we provide a summary of the expansive set of optimizations from DeepSpeed-inference made available via MII. For more details, please refer to \[[1](https://arxiv.org/abs/2207.00032), [2](https://arxiv.org/abs/2206.01861)\]: + +**DeepFusion for Transformers:** For transformer-based models such as Bert, Roberta, GPT-2, and GPT-J, MII leverages the transformer kernels in DeepSpeed-Inference that are optimized to achieve low latency at small batch sizes and high throughput at large batch sizes using DeepFusion. 
+ +**Multi-GPU Inference with Tensor-Slicing:** For massive models such as Bloom 176B, MII automatically enables tensor-parallelism within a node to leverage aggregate memory bandwidth and compute across multiple GPUs to achieve the lowest latency and throughput compared to anything else that is currently available. + +**INT8 Inference with ZeroQuant:** For massive models with tens or hundreds of billions of parameters, MII supports INT8 Inference with ZeroQuant. Using this feature not only reduces the memory footprint and the number of GPUs required for inference but also increases the inference throughput by supporting larger batch sizes and using INT8 compute, thus lowering cost compared to FP16. + +**ZeRO-Inference for Resource Constrained Systems:** Models such as Bloom 176B, require over 176 GB of memory to just fit the model even with INT8 support. In the absence of the aggregate GPU memory across multiple GPUs required to deploy such models, MII enables [ZeRO-Inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html) that can leverage the system CPU memory to deploy these massive models with a single GPU with limited memory. + +**Compiler Optimizations:** When applicable, MII automatically applies compiler-based optimizations via [TorchScript](https://pytorch.org/docs/stable/jit.html), [nvFuser](https://pytorch.org/blog/introducing-nvfuser-a-deep-learning-compiler-for-pytorch/), and [CUDA graph](https://developer.nvidia.com/blog/cuda-graphs/), in addition to the above optimizations, to further lower latency and improve throughput. + +# MII-Public and MII-Azure + +MII can work with two variations of DeepSpeed-Inference. The first, referred to as ds-public, contains most of the optimizations discussed above and is also available via our open-source DeepSpeed library. The second referred to as ds-azure, offers tighter integration with Azure, and is available via MII to all Microsoft Azure customers. 
We refer to MII running the two DeepSpeed-Inference variants as MII-Public and MII-Azure, respectively. + +Both MII-Public and MII-Azure offer significant latency and cost reduction compared to open-sourced PyTorch implementation (Baseline). However for certain generative workloads, they can have differentiated performance: MII-Azure provides further improvements beyond MII-Public. We quantify the latency and cost reduction for both variations in the next section. + +# Quantifying Latency and Cost Reduction + +Inference workloads can be either latency critical, where the primary objective is to minimize latency, or cost sensitive, where the primary objective is to minimize cost. In this section, we quantify the benefits of using MII for both latency-critical and cost-sensitive scenarios. + +## Latency Critical Scenarios + +For latency-critical scenarios, where a small batch size of 1 is often used, MII can reduce the latency by up to 6x for a wide range of open-source models, across multiple tasks. More specifically, we show model latency reduction of [^overhead_details]: + +1. Up to 5.7x for multi-GPU inference for text generation using massive models such as Big Science Bloom, Facebook OPT, and EleutherAI NeoX (*Figure 2 (left)*) + +2. Up to 1.9x for image generation tasks using Stable Diffusion (*Figure 2 (right)*) + +3. Up to 3x for relatively smaller text generation models (up to 7B parameters) based on OPT, BLOOM, and GPT architectures, running on a single GPU (*Figures 3 and 4*) + +4. Up to 9x for various text representation tasks like fill-mask, text classification, question answering, and token classification using RoBERTa- and BERT- based models (*Figures 5 and 6*). + +[ ![multi gpu latency](/assets/images/mii/llm-latency-sd-latency.png) ](/assets/images/mii/llm-latency-sd-latency-zoom.png){: .align-center} +*Figure 2: (left) Best achievable latency for large models. MII-Azure (int8) offers 5.7X lower latency compared to Baseline for Bloom-176B. 
(right) Stable Diffusion text to image generation latency comparison.* + + + +[ ![OPT and BLOOM Models](/assets/images/mii/opt-bloom.png) ](/assets/images/mii/opt-bloom.png){: .align-center} +*Figure 3: Latency comparison for OPT and BLOOM models. MII-Azure is up to 2.8x faster than baseline.* + +[ ![GPT Models](/assets/images/mii/gpt.png) ](/assets/images/mii/gpt.png){: .align-center} +*Figure 4: Latency comparison for GPT models. MII-Azure is up to 3x faster than baseline.* + +[ ![Roberta Models](/assets/images/mii/roberta.png) ](/assets/images/mii/roberta.png){: .align-center} +*Figure 5: Latency comparison for RoBERTa models. MII offers up to 9x lower model latency and up to 3x lower end-to-end latency than baseline on several tasks and RoBERTa variants [^overhead_details].* + +[ ![Bert Models](/assets/images/mii/bert.png) ](/assets/images/mii/bert.png){: .align-center} +*Figure 6: Latency comparison for BERT models. MII offers up to 8.9x lower model latency and up to 4.5x lower end-to-end latency across several tasks and BERT variants[^overhead_details].* + +[^overhead_details]: The end-to-end latency of an inference workload is comprised of two components: i) actual model execution, and ii) pre-/post-processing before and after the model execution. MII optimizes the actual model execution but leaves the pre-/post-processing pipeline for future optimizations. We notice that text representation tasks have significant pre-/post-processing overhead (*Figures 5 and 6*). We plan to address those in a future update. + +## Cost Sensitive Scenarios + +MII can significantly reduce the inference cost of very expensive language models like Bloom, OPT, etc. To get the lowest cost, we use a large batch size that maximizes throughput for both baseline and MII. Here we look at the cost reduction from MII using two different metrics: i) tokens generated per second per GPU, and ii) dollars per million tokens generated. 
+ +*Figures 7 and 8* show that MII-Public offers over 10x throughput improvement and cost reduction compared to the baseline, respectively. Furthermore, MII-Azure offers over 30x improvement in throughput and cost compared to the baseline. + +[ ![tput large models](/assets/images/mii/tput-llms.png) ](/assets/images/mii/tput-llms.png){: .align-center} +*Figure 7: Throughput comparison per A100-80GB GPU for large models. MII-Public offers over 15x throughput improvement while MII-Azure offers over 40x throughput improvement.* + +[ ![azure cost](/assets/images/mii/azure-cost.png) ](/assets/images/mii/azure-cost.png){: .align-center} +*Figure 8: Cost of generating 1 million tokens on Azure with different model types. MII-Azure reduces the cost of generation by over 40x.* + +# Deployment Options + +MII supported models can be deployed in two different ways as shown in *Figure 1* with just a few lines of code. + +## MII-Public Deployment + +MII-Public can be deployed on-premises or on any cloud offering. MII creates a lightweight GRPC server to support this form of deployment and provides a GRPC inference endpoint for queries. The code below shows how a supported model can be deployed with MII-Public Deployment. + +```python +import mii +mii.deploy(task="text-to-image", + model="CompVis/stable-diffusion-v1-4", + deployment_name="sd-deployment") +``` + +## MII-Azure Deployment + +MII supports deployment on Azure via AML Inference. To enable this, MII generates AML deployment assets for a given model that can be deployed using the [Azure-CLI](https://learn.microsoft.com/en-us/cli/azure/what-is-azure-cli), as shown in the code below. Furthermore, deploying on Azure, allows MII to leverage DeepSpeed-Azure as its optimization backend, which offers better latency and cost reduction than DeepSpeed-Public. 
+ +```python +import mii +mii.deploy(task="text-to-image", +           model="CompVis/stable-diffusion-v1-4", +           deployment_name="sd-deployment", +           deployment_type=DeploymentType.AML) +``` + +To learn more about these deployment options and get started with MII, please see the [MII getting started guide](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii). + +# Concluding Remarks + +We are very excited to share MII with the community and improve it with your feedback. We will continue to add support for more models in MII as well as enhance both MII-Public and MII-Azure for both on-premise and Azure users. Our hope is that while open sourcing has made powerful AI capabilities accessible to many, MII will allow for a wider infusion of these capabilities into a diverse set of applications and product offerings by instantly reducing the latency and cost of inferencing. + +# Appendix + +The table below shows the mapping between model aliases used in *Figures 3, 4, 5, and 6* and real model names. 
+ +| Alias | Model Name | +| --- | --- | +| text-gen-m1 | [sberbank-ai/rugpt3large_based_on_gpt2](https://huggingface.co/sberbank-ai/rugpt3large_based_on_gpt2) | +| text-gen-m2 | [skt/kogpt2-base-v2](https://huggingface.co/skt/kogpt2-base-v2) | +| text-gen-m3 | [geralt/MechDistilGPT2](https://huggingface.co/geralt/MechDistilGPT2) | +| text-gen-m4 | [mrm8488/distilgpt2-finetuned-wsb-tweets](https://huggingface.co/mrm8488/distilgpt2-finetuned-wsb-tweets) | +| text-gen-m5 | [Norod78/hebrew-bad_wiki-gpt_neo-tiny](https://huggingface.co/Norod78/hebrew-bad_wiki-gpt_neo-tiny) | +| text-gen-m6 | [shibing624/code-autocomplete-distilgpt2-python](https://huggingface.co/shibing624/code-autocomplete-distilgpt2-python) | +| text-gen-m7 | [mrm8488/diltilgpt2-finetuned-bookcopus-10](https://huggingface.co/mrm8488/diltilgpt2-finetuned-bookcopus-10) | +| bert-q&a-m1 | [bert-large-uncased-whole-word-masking-finetuned-squad](https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad) | +| bert-q&a-m2 | [deepset/bert-large-uncased-whole-word-masking-squad2](https://huggingface.co/deepset/bert-large-uncased-whole-word-masking-squad2) | +| bert-q&a-m3 | [nyust-eb210/braslab-bert-drcd-384](https://huggingface.co/nyust-eb210/braslab-bert-drcd-384) | +| bert-q&a-m4 | [deepset/minilm-uncased-squad2](https://huggingface.co/deepset/minilm-uncased-squad2) | +| bert-token-class-m1 | [dslim/bert-large-NER](https://huggingface.co/dslim/bert-large-NER) | +| bert-token-class-m2 | [dbmdz/bert-large-cased-finetuned-conll03-english](https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english) | +| bert-token-class-m3 | [dslim/bert-base-NER](https://huggingface.co/dslim/bert-base-NER) | +| bert-token-class-m4 | [CAMeL-Lab/bert-base-arabic-camelbert-mix-ner](https://huggingface.co/CAMeL-Lab/bert-base-arabic-camelbert-mix-ner) | +| bert-fill-mask-m1 | [bert-base-multilingual-cased](https://huggingface.co/bert-base-multilingual-cased) | +| bert-fill-mask-m2 | 
[bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased) | +| bert-fill-mask-m3 | [wietsedv/bert-base-dutch-cased](https://huggingface.co/wietsedv/bert-base-dutch-cased) | +| bert-fill-mask-m4 | [nlpaueb/bert-base-greek-uncased-v1](https://huggingface.co/nlpaueb/bert-base-greek-uncased-v1) | +| bert-fill-mask-m5 | [dbmdz/bert-base-italian-xxl-cased](https://huggingface.co/dbmdz/bert-base-italian-xxl-cased) | +| bert-fill-mask-m6 | [aubmindlab/bert-base-arabertv02](https://huggingface.co/aubmindlab/bert-base-arabertv02) | +| bert-fill-mask-m7 | [dccuchile/bert-base-spanish-wwm-uncased](https://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased) | +| bert-fill-mask-m8 | [bert-base-german-cased](https://huggingface.co/bert-base-german-cased) | +| bert-fill-mask-m9 | [bert-base-uncased](https://huggingface.co/bert-base-uncased) | +| bert-fill-mask-m10 | [dbmdz/bert-base-german-cased](https://huggingface.co/dbmdz/bert-base-german-cased) | +| bert-fill-mask-m11 | [nlpaueb/legal-bert-base-uncased](https://huggingface.co/nlpaueb/legal-bert-base-uncased) | +| bert-fill-mask-m12 | [KB/bert-base-swedish-cased](https://huggingface.co/KB/bert-base-swedish-cased) | +| bert-fill-mask-m13 | [indolem/indobertweet-base-uncased](https://huggingface.co/indolem/indobertweet-base-uncased) | +| bert-fill-mask-m14 | [emilyalsentzer/Bio_ClinicalBERT](https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT) | +| bert-fill-mask-m15 | [asafaya/bert-mini-arabic](https://huggingface.co/asafaya/bert-mini-arabic) | +| bert-text-class-m1 | [DTAI-KULeuven/mbert-corona-tweets-belgium-topics](https://huggingface.co/DTAI-KULeuven/mbert-corona-tweets-belgium-topics) | +| bert-text-class-m2 | [avichr/heBERT_sentiment_analysis](https://huggingface.co/avichr/heBERT_sentiment_analysis) | +| bert-text-class-m3 | [finiteautomata/beto-sentiment-analysis](https://huggingface.co/finiteautomata/beto-sentiment-analysis) | +| bert-text-class-m4 | 
[ProsusAI/finbert](https://huggingface.co/ProsusAI/finbert) | +| bert-text-class-m5 | [cross-encoder/ms-marco-MiniLM-L-12-v2](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-12-v2) | +| bert-text-class-m6 | [nlptown/bert-base-multilingual-uncased-sentiment](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) | +| bert-text-class-m7 | [microsoft/xtremedistil-l6-h256-uncased](https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased) | +| bert-text-class-m8 | [cross-encoder/ms-marco-MiniLM-L-6-v2](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) | +| fill-mask-m1 | [vinai/bertweet-large](https://huggingface.co/vinai/bertweet-large) | +| fill-mask-m2 | [klue/roberta-large](https://huggingface.co/klue/roberta-large) | +| fill-mask-m3 | [sberbank-ai/ruRoberta-large](https://huggingface.co/sberbank-ai/ruRoberta-large) | +| q&a-m1 | [deepset/roberta-large-squad2](https://huggingface.co/deepset/roberta-large-squad2) | +| token-class-m1 | [Jean-Baptiste/roberta-large-ner-english](https://huggingface.co/Jean-Baptiste/roberta-large-ner-english) | +| text-class-m1 | [cross-encoder/stsb-roberta-large](https://huggingface.co/cross-encoder/stsb-roberta-large) | +| text-class-m2 | [siebert/sentiment-roberta-large-english](https://huggingface.co/siebert/sentiment-roberta-large-english) | +| text-class-m3 | [roberta-large-mnli](https://huggingface.co/roberta-large-mnli) | +| fill-mask-m4 | [vinai/bertweet-base](https://huggingface.co/vinai/bertweet-base) | +| fill-mask-m5 | [vinai/phobert-base](https://huggingface.co/vinai/phobert-base) | +| fill-mask-m6 | [microsoft/graphcodebert-base](https://huggingface.co/microsoft/graphcodebert-base) | +| fill-mask-m7 | [vinai/bertweet-covid19-base-uncased](https://huggingface.co/vinai/bertweet-covid19-base-uncased) | +| fill-mask-m8 | [uklfr/gottbert-base](https://huggingface.co/uklfr/gottbert-base) | +| fill-mask-m9 | 
[cardiffnlp/twitter-roberta-base](https://huggingface.co/cardiffnlp/twitter-roberta-base) | +| fill-mask-m10 | [microsoft/codebert-base-mlm](https://huggingface.co/microsoft/codebert-base-mlm) | +| fill-mask-m11 | [pdelobelle/robbert-v2-dutch-base](https://huggingface.co/pdelobelle/robbert-v2-dutch-base) | +| fill-mask-m12 | [ufal/robeczech-base](https://huggingface.co/ufal/robeczech-base) | +| q&a-m2 | [Rakib/roberta-base-on-cuad](https://huggingface.co/Rakib/roberta-base-on-cuad) | +| q&a-m3 | [thatdramebaazguy/roberta-base-squad](https://huggingface.co/thatdramebaazguy/roberta-base-squad) | +| text-class-m4 | [roberta-base-openai-detector](https://huggingface.co/roberta-base-openai-detector) | +| text-class-m5 | [pysentimiento/robertuito-emotion-analysis](https://huggingface.co/pysentimiento/robertuito-emotion-analysis) | +| text-class-m6 | [cardiffnlp/twitter-roberta-base-sentiment](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment) | +| text-class-m7 | [cardiffnlp/twitter-roberta-base-sentiment-latest](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest) | +| q&a-m4 | [deepset/roberta-base-squad2](https://huggingface.co/deepset/roberta-base-squad2) | +| text-class-m8 | [textattack/roberta-base-SST-2](https://huggingface.co/textattack/roberta-base-SST-2) | +| text-class-m9 | [cardiffnlp/twitter-roberta-base-emotion](https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion) | +| text-class-m10 | [pysentimiento/robertuito-sentiment-analysis](https://huggingface.co/pysentimiento/robertuito-sentiment-analysis) | +| text-class-m11 | [finiteautomata/bertweet-base-sentiment-analysis](https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis) | +| fill-mask-m13 | [huggingface/CodeBERTa-small-v1](https://huggingface.co/huggingface/CodeBERTa-small-v1) | +| q&a-m5 | [deepset/tinyroberta-squad2](https://huggingface.co/deepset/tinyroberta-squad2) | +| text-class-m12 | 
[j-hartmann/emotion-english-distilroberta-base](https://huggingface.co/j-hartmann/emotion-english-distilroberta-base) | diff --git a/docs/_posts/2022-12-12-data-efficiency.md b/docs/_posts/2022-12-12-data-efficiency.md new file mode 100644 index 0000000000000000000000000000000000000000..3b6adb4d7dab957bd45437da8b00030945bde708 --- /dev/null +++ b/docs/_posts/2022-12-12-data-efficiency.md @@ -0,0 +1,144 @@ +--- +title: "DeepSpeed Data Efficiency: A composable library that makes better use of data, increases training efficiency, and improves model quality" +excerpt: "" +date: 2022-12-12 00:09:00 +tags: training +--- + +[ ![DeepSpeed Data Efficiency](/assets/images/data_efficiency/data_efficiecy_fig0.png) ](/assets/images/data_efficiency/data_efficiecy_fig0.png){: .align-center} + +Recently, large-scale deep learning models are empowering us to achieve more in many ways, such as [improving programming efficiency by code generation](https://github.com/features/copilot) and [providing art inspiration by text-to-image generation](https://www.microsoft.com/en-us/microsoft-365/blog/2022/10/12/new-tools-from-microsoft-365-edge-and-bing-bridge-the-gap-between-productivity-and-creativity/). To enable these services and keep improving the quality, deep learning model architecture evolves rapidly, and the model size is also growing at a tremendous speed. For example, from GPT to GPT-3 the model size increased 1500x in 2 years. The increasing model size leads to unprecedented training cost, making it challenging for many AI practitioners to train their own models. On the other hand, a less-emphasized perspective is that **data scale is actually increasing at a similar speed as model scale, and the training cost is proportional to both of them.** In Figure 1 below we plot the model and data scales of several representative language models in the last 5 years. 
From the oldest model on the left to the newest models on the right, both the model and data scales increase at similar speed. This demonstrates the importance of improving data efficiency: achieve same model quality with less data and reduced training cost, or achieve better model quality with the same amount of data and similar training cost. + +[ ![Model and data scales](/assets/images/data_efficiency/data_efficiecy_fig1.png) ](/assets/images/data_efficiency/data_efficiecy_fig1.png){: .align-center} + +*Figure 1: Model scale (number of parameters) and data scale (number of tokens consumed during training) of representative language models in the last 5 years.* + +There are two popular research directions among existing data efficiency techniques: Data sampling techniques aim to improve the convergence speed by sampling the most suitable next data batch from the whole data pool; Data routing techniques aim to reduce the computation by routing each data to only a subset of the model components. These techniques improve data and training efficiency, but existing solutions on them have limitations on **extensibility, flexibility, and composability.** They are commonly designed for specific training tasks, making them hard to be extended with customized strategies and making them less flexible to be applied on diverse workloads from different users. Furthermore, different techniques are implemented separately, making it challenging to compose multiple solutions to further improve data and training efficiency. + +To address these challenges, we, the DeepSpeed team as part of Microsoft’s [AI at Scale](https://www.microsoft.com/en-us/research/project/ai-at-scale/) initiative, are proud to announce **DeepSpeed Data Efficiency Library** – a composable framework that makes better use of data, increases training efficiency, and improves model quality. 
DeepSpeed Data Efficiency takes extensibility, flexibility, and composability into consideration, and it specifically demonstrates the following innovations: + +**Efficient data sampling via curriculum learning.** Curriculum learning (CL) improves data efficiency by sampling from easier data. We present a general curriculum learning library which enables users to employ curriculum learning to their models at **maximum extensibility**: users can easily analyze, index, and sample their training data based on various customizable strategies. Using this library, we were able to explore different CL strategies for GPT-3 and BERT pretraining and identify the best solution that provides up to **1.5x data saving** while still maintaining similar model quality. + +**Efficient data routing via random layerwise token dropping.** We present a novel data routing technique called random layerwise token dropping (random-LTD) to skip the computation of a subset of the input tokens at all middle layers. Random-LTD employs a simple yet effective routing strategy and requires **minimal model architecture change.** It is **flexible** to apply random-LTD to various tasks (GPT-3/BERT pretraining and GPT/ViT finetuning), and we achieve great data efficiency improvement (up to **1.5x data saving** while still maintaining the model quality). + +**Seamlessly composing multiple methods.** The proposed DeepSpeed Data Efficiency framework seamlessly composes the curriculum learning and random-LTD techniques, and only requires minimal changes on the user code side. Furthermore, by composing both methods we can achieve even better data and training efficiency: for GPT-3 1.3B pretraining, we achieve **2x data and 2x time savings** together with better or similar model quality compared to the baseline training. When using the same amount of data, our approach further improves the model quality over the baseline. 
Users can also extend and contribute to the library by adding additional data efficiency techniques to compose together. + +Each of these advances is explored further in the blog post below. For more about the technical details, please read our papers, “[Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers](https://arxiv.org/abs/2211.11586)” which describes the random-LTD technique, and “[DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing](https://arxiv.org/abs/2212.03597)” which describes the curriculum learning technique and overall DeepSpeed Data Efficiency framework. + +# Efficient Data Sampling via Curriculum Learning + +## Motivation + +Curriculum learning aims to improve training convergence speed by presenting relatively easier or simpler examples earlier during training. Building a curriculum learning solution usually requires two components: the difficulty metric (i.e., how to quantify the difficulty of each data sample) and the pacing function (i.e., how to decide the curriculum difficulty range when sampling next training data batch). Curriculum learning has been successfully applied to various training tasks, and last year we also released a specific curriculum learning technique (sequence length warmup) for GPT-style model pretraining (see technical details in our paper “[The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models](https://openreview.net/forum?id=JpZ5du_Kdh)” published in NeurIPS 2022). However, one common limitation among existing works is that there does not exist a generalized and extensible curriculum learning library, which allows practitioners to easily apply custom curriculum difficulty metrics, the combination of metrics, and pacing functions. 
+ +## Design + +To solve the limitation of existing solutions, we design and implement a general curriculum learning library emphasizing the extensibility. It consists of three components as shown in Figure 2 below (top part). First, we use a data analyzer to perform the offline CPU-only data analysis which indexes the whole data pool based on any difficulty metric such as the sequence length, the vocabulary rarity, or anything defined by user. Next, during training, the curriculum scheduler determines the difficulty threshold for the current step based on a pacing function such as linear, rooted, or any strategy provided by users. Then the data sampler will sample the data with desired difficulty from the indexed data pool. Overall, this general implementation would enable users to explore curriculum learning on their workloads with maximum customizability (more technical details in [our DeepSpeed Data Efficiency paper](https://arxiv.org/abs/2212.03597)). + +[ ![DeepSpeed Data Efficiency framework](/assets/images/data_efficiency/data_efficiecy_fig2.png) ](/assets/images/data_efficiency/data_efficiecy_fig2.png){: .align-center} + +*Figure 2: Design of the DeepSpeed Data Efficiency framework.* + +## Evaluation Results + +Using this general and extensible curriculum learning solution for GPT-3 and BERT-Large model pretraining, we are able to easily analyze and index the huge training data based on up to 7 difficulty metrics and enable better data and training efficiency. For GPT-3 pretraining, our solution with the best difficulty metric (combination of truncation-based sequence length and vocabulary rarity) achieves 1.5x data and training cost saving while still maintaining model quality as baseline (Table 1 Case (8) vs. (1)). For BERT-Large pretraining, our solution with the best difficulty metric (vocabulary rarity) achieves 1.5x saving while still maintaining model quality (Table 2 Case (8) vs. (1)). 
On the other hand, our solutions can further improve model quality when using the same amount of data as baseline (Table 1 Case (2) to (6), Table 2 Case (2) to (6)). + +| **Case** | **Pretrain data** | **Avg 0-shot accuracy** | **Avg 10-shot accuracy** | +| ---------- |---------- |---------- |---------- | +| (1) Baseline | 300B | 42.5 | 44.0 | +| (2) CL truncation-based sequence length | 300B | 43.4 | 44.8 | +| (3) CL reshape-based sequence length | 300B | 43.0 | 44.5 | +| (4) CL vocabulary rarity | 300B | 42.3 | 44.5 | +| (5) CL combining (2) and (4) | 300B | **43.6** | **44.9** | +| (6) CL combining (3) and (4) | 300B | 43.0 | 44.4 | +| (7) Baseline | 200B (1.5x) | 41.9 | 44.0 | +| (8) CL combining (2) and (4) | **200B (1.5x)** | 42.7 | 44.5 | + +*Table 1: GPT-3 1.3B pretraining data consumption and average evaluation accuracy on 19 tasks.* + +| **Case** | **Pretrain data** | **GLUE finetune score** | +| ---------- |---------- |---------- | +| (1) Baseline | 1049B | 87.29 | +| (2) CL truncation-based sequence length | 1049B | 87.31 | +| (3) CL reorder-based sequence length | 1049B | 87.48 | +| (4) CL vocabulary rarity | 1049B | 87.36 | +| (5) CL combining (2) and (4) | 1049B | **87.60** | +| (6) CL combining (3) and (4) | 1049B | 87.06 | +| (7) Baseline | 703B (1.5x) | 87.19 | +| (8) CL combining (2) and (4) | **703B (1.5x)** | 87.29 | + +*Table 2: BERT-Large pretraining data consumption and average GLUE finetuning score on 8 tasks.* + +# Efficient Data Routing via Random Layerwise Token Dropping + +## Motivation + +Standard data routing usually feeds the full images/sequences into all layers of a model. However, this process may not be optimal for training efficiency since some parts of an image (or words of a sentence) do not require a frequent feature update. 
As such, the token dropping method has been proposed, which is illustrated in Figure 3 (b) below, to skip the compute of some tokens/words (i.e., G-2 tokens in Figure 3 (b)) of a sentence in order to save the compute cost. + +Although existing methods show promising results, they also exhibit several caveats: (1) most works solely focus on BERT (encoder-only on text data) pretraining and do not include decoder pretraining and/or other modalities (e.g., images); (2) the ability to skip layers is limited, which bounds the total amount of compute saving. By analyzing existing methods, we found out the potential main issue that limits their skipping and coverage abilities is the loss of attention mechanism for G-2 tokens for all skipped layers, since multi-head attention focuses on different tokens at different layer depths and the attention map aligns with the dependency relation most strongly in the middle of transformer architectures. + +## Design + +To resolve this main issue, we propose random-LTD, a **random** and **layerwise** token dropping mechanism, which processes only a subset of tokens among the entire data batch for all middle layers in order to save compute cost (see more details in [our Random-LTD paper](https://arxiv.org/abs/2211.11586)). As such, each token rarely bypasses all middle layers and its dependency with other tokens can be captured by the model. The illustration of random-LTD compared to baseline is shown in Figure 3 below, where random-LTD splits the input tokens into two groups and only the first group involves the compute. + +[ ![random-LTD](/assets/images/data_efficiency/data_efficiecy_fig3.png) ](/assets/images/data_efficiency/data_efficiecy_fig3.png){: .align-center} + +*Figure 3: Comparison between baseline, existing token dropping methods, and random-LTD. Note that for random-LTD, only part of the inputs (Group 1) is used for Layer i.* + +Random-LTD is simple yet very effective. 
Particularly, compared to other existing token dropping methods, random-LTD (1) does a purely random selection for each layer for the two different groups; as such, we do not require any expert design for the selection criterion; (2) can be applied to all middle layers to achieve a better saving ratio; (3) demonstrates great generalizability for both encoder and decoder models; and (4) is easy to use without much modeling change. These advantages enable maximum flexibility when applying random-LTD to various workloads. + +## Evaluation Results + +Thanks to its great flexibility, we were able to apply the random-LTD method to broader applications, including BERT and GPT pretraining as well as ViT and GPT finetuning tasks. For all cases, random-LTD achieves similar model quality as the baseline while using less data, and/or achieves better model quality while using the same amount of data (Table 3 to 6). For GPT-3 and BERT-Large pretraining, random-LTD achieves 1.5-2x data saving while still maintaining the same model quality. For GPT-3 we also tested random-LTD with full data, which further improves the model quality compared to baseline. 
+ +| **Case** | **Pretrain data** | **Avg 0-shot accuracy** | +| ---------- |---------- |---------- | +| (1) Baseline | 300B | 42.5 | +| (2) Random-LTD | 300B | **43.7** | +| (3) Random-LTD | **200B (1.5x)** | 42.5 | + +*Table 3: GPT-3 1.3B pretraining data consumption and average evaluation accuracy on 19 tasks.* + +| **Case** | **Pretrain data** | **GLUE finetune score** | +| ---------- |---------- |---------- | +| (1) Baseline | 1049B | 87.29 | +| (2) Random-LTD | **524B (2x)** | **87.32** | + +*Table 4: BERT-Large pretraining data consumption and average GLUE finetuning score on 8 tasks.* + +| **Case** | **Train data** | **ImageNet Top-1 Acc** | +| ---------- |---------- |---------- | +| (1) Baseline | 100% | 84.65 | +| (2) Random-LTD | **77.7% (1.3x)** | **84.70** | + +*Table 5: Finetuning result of ViT on ImageNet.* + +| **Case** | **Train data** | **PTB PPL** | +| ---------- |---------- |---------- | +| (1) Baseline | 100% | 16.11 | +| (2) Random-LTD | 100% | **15.9** | + +*Table 6: GPT-2 350M finetuning result on the PTB task.* + +# Composing Data Efficiency Techniques to Achieve More + +The curriculum learning and random-LTD techniques are complementary. Inside DeepSpeed Data Efficiency framework, we seamlessly compose the two techniques as shown in Figure 2 above, where curriculum learning helps to sample the next data batch and random-LTD helps to decide how to route each sampled data inside the model. DeepSpeed Data Efficiency solves several complexities when composing the two techniques so that users can easily apply each technique or both to their training pipeline. The composability of DeepSpeed Data Efficiency also applies to data sampling and routing techniques in general, so that it provides a platform to implement and compose additional data efficiency techniques. + +The composed DeepSpeed Data Efficiency solution leverages both data efficiency techniques and achieves even better data and training efficiency. 
Take the GPT-3 pretraining task as an example, composing CL and random-LTD, with 100% data, leads to the best model quality in our experiments (Table 7 Case (1) to (4)). When pretraining with 50% data, the baseline training results in worse zero-shot and 10-shot evaluation accuracy, and using either CL or random-LTD can only recover part of the 10-shot accuracy loss. On the other hand, the composed data efficiency solution achieves the same or better accuracy results as baseline with 100% data, demonstrating a 2x data and 2x time saving (Case (5) to (8)). Similar benefit such as 2x data saving was also observed when applying our solution to BERT pretraining. + +| **Case** | **Pretrain data** | **Pretrain time (on 64 V100)** | **Avg 0-shot accuracy** | **Avg 10-shot accuracy** | +| ---------- |---------- |---------- |---------- |---------- | +| (1) Baseline | 300B | 260hr | 42.5 | 44.0 | +| (2) CL best metric | 300B | 259hr | 43.6 | 44.9 | +| (3) random-LTD | 300B | 263hr | 43.7 | 44.9 | +| (4) CL + random-LTD | 300B | 260hr | **43.8** | **45.1** | +| (5) Baseline | 150B (2x) | 130hr (2x) | 42.0 | 42.7 | +| (6) CL best metric | 150B (2x) | 129hr (2x) | 42.6 | 43.7 | +| (7) random-LTD | 150B (2x) | 131hr (2x) | 42.7 | 43.5 | +| (8) CL + random-LTD | **150B (2x)** | **130hr (2x)** | 42.8 | 44.0 | + +*Table 7: GPT-3 1.3B pretraining data/time consumption and average evaluation accuracy on 19 tasks.* + +# Concluding Remarks + +We are very excited to share DeepSpeed Data Efficiency library with the community and improve it with your feedback. Please find the code, tutorial, and documents at the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed), and [website](/tutorials/data-efficiency/). And for more technical details please read our [Random-LTD paper](https://arxiv.org/abs/2211.11586) and [DeepSpeed Data Efficiency paper](https://arxiv.org/abs/2212.03597). 
We believe that our composable library and novel data efficiency techniques will help users reduce training cost while maintaining model quality or achieve better quality under similar cost. And we hope DeepSpeed Data Efficiency could become a platform that motivates and accelerates future research on deep learning data efficiency. diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md old mode 100644 new mode 100755 diff --git a/docs/_tutorials/automatic-tensor-parallelism.md b/docs/_tutorials/automatic-tensor-parallelism.md new file mode 100644 index 0000000000000000000000000000000000000000..6991d5caf92574c621c2f20758764f76b98df88a --- /dev/null +++ b/docs/_tutorials/automatic-tensor-parallelism.md @@ -0,0 +1,154 @@ +--- +title: "Automatic Tensor Parallelism for HuggingFace Models" +tags: inference +--- + +# Contents + * [Introduction](#introduction) + * [Example Script](#example-script) + * [Launching](#launching) + * [OPT 13B Inference Performance Comparison](#opt-13b-inference-performance-comparison) + * [Supported Models](#supported-models) + * [Unsupported Models](#unsupported-models) + +# Introduction +This tutorial demonstrates the new automatic tensor parallelism feature for inference. Previously, the user needed to provide an injection policy to DeepSpeed to enable tensor parallelism. DeepSpeed now supports automatic tensor parallelism for HuggingFace models by default as long as kernel injection is not enabled and an injection policy is not provided. This allows our users to improve performance of models that are not currently supported via kernel injection, without providing the injection policy. 
Below is an example of the new method: + +```python +# --------------------------------------- +# New automatic tensor parallelism method +# --------------------------------------- +import os +import torch +import transformers +import deepspeed +local_rank = int(os.getenv("LOCAL_RANK", "0")) +world_size = int(os.getenv("WORLD_SIZE", "1")) +# create the model pipeline +pipe = transformers.pipeline(task="text2text-generation", model="google/t5-v1_1-small", device=local_rank) +# Initialize the DeepSpeed-Inference engine +pipe.model = deepspeed.init_inference( + pipe.model, + mp_size=world_size, + dtype=torch.float +) +output = pipe('Input String') +``` + +Previously, to run inference with only tensor parallelism for the models that don't have kernel injection support, you could pass an injection policy that showed the two specific linear layers on a Transformer Encoder/Decoder layer: 1) the attention output GeMM and 2) layer output GeMM. We needed these parts of the layer to add the required all-reduce communication between GPUs to merge the partial results across model-parallel ranks. 
Below, we show an example of this previous method: + +```python +# ---------------------------------- +# Previous tensor parallelism method +# ---------------------------------- +import os +import torch +import transformers +import deepspeed +from transformers.models.t5.modeling_t5 import T5Block +local_rank = int(os.getenv("LOCAL_RANK", "0")) +world_size = int(os.getenv("WORLD_SIZE", "1")) +# create the model pipeline +pipe = transformers.pipeline(task="text2text-generation", model="google/t5-v1_1-small", device=local_rank) +# Initialize the DeepSpeed-Inference engine +pipe.model = deepspeed.init_inference( + pipe.model, + mp_size=world_size, + dtype=torch.float, + injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')} +) +output = pipe('Input String') +``` + +With automatic tensor parallelism, we do not need to provide the injection policy for supported models. The injection policy will be determined at runtime and applied automatically. + + +# Example Script + +We can observe performance improvement with automatic tensor parallelism using the [inference test suite](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/inference-test.py). The script includes per token latency, bandwidth, throughput and memory checks for comparison. See the [README](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/huggingface/text-generation#deepspeed-huggingface-text-generation-examples) for more information. + + +## Launching + +Use the following command to run without DeepSpeed and without tensor parallelism. 
Set the `test_performance` flag to collect performance data: + +```bash +deepspeed --num_gpus <num_gpus> DeepSpeedExamples/inference/huggingface/text-generation/inference-test.py --name <model> --batch_size <batch_size> --test_performance +``` + + +To enable tensor parallelism, you need to use the flag `ds_inference` for the compatible models: + +```bash +deepspeed --num_gpus <num_gpus> DeepSpeedExamples/inference/huggingface/text-generation/inference-test.py --name <model> --batch_size <batch_size> --test_performance --ds_inference +``` + +## OPT 13B Inference Performance Comparison + +The following results were collected using V100 SXM2 32GB GPUs. + +### Max New Tokens = 50 + +| Test | Memory Allocated per GPU | Max Batch Size | Max Throughput per GPU | +| ---------- | -------------------------- | ---------------- | ------------------------ | +| No TP | 23.94 GB | 64 | 18.84 TFlops | +| 2 GPU TP | 12.23 GB | 320 | 27.17 TFlops | +| 4 GPU TP | 6.36 GB | 664 | 27.63 TFlops | + +### Max New Tokens = 1024 + +| Test | Memory Allocated per GPU | Max Batch Size | Max Throughput per GPU | +| ---------- | -------------------------- | ---------------- | ------------------------ | +| No TP | 23.94 GB | 2 | 1.65 TFlops | +| 2 GPU TP | 12.23 GB | 20 | 4.61 TFlops | +| 4 GPU TP | 6.36 GB | 56 | 4.90 TFlops | + +# Supported Models + +The following model families have been successfully tested with automatic tensor parallelism. Other models may work but have not been tested yet. + +- albert +- bert +- bigbird_pegasus +- camembert +- deberta_v2 +- electra +- ernie +- esm +- gpt-j +- gpt-neo +- gpt-neox +- longt5 +- luke +- m2m_100 +- marian +- mvp +- nezha +- openai +- opt +- pegasus +- perceiver +- plbart +- reformer +- roberta +- roformer +- splinter +- t5 +- xglm +- xlm_roberta +- yoso + +# Unsupported Models + +The following models are not currently supported with automatic tensor parallelism. 
They may still be compatible with other DeepSpeed features (e.g., kernel injection for Bloom): + +- bloom +- codegen +- deberta +- flaubert +- fsmt +- gpt2 +- led +- longformer +- xlm +- xlnet diff --git a/docs/_tutorials/autotuning.md b/docs/_tutorials/autotuning.md index 303087d298a7bb9a7d8d88ae3122b129b8f61a1e..38648daa89f246f2b5dc3485336e3391e1b4ab00 100644 --- a/docs/_tutorials/autotuning.md +++ b/docs/_tutorials/autotuning.md @@ -120,3 +120,7 @@ Note that the performance metric used in autotuning is calculated using the timi Tuning completed in 0:27:33.988447. Total number of experiments: 13. As we can see the DeepSpeed Autotuner can select a better than hand-tuned configuration with a reasonable number of experiments. Examples in [Autotuning Hugging Face Examples](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) would demonstrate the effectiveness of autotuning across different models. + +### DeepSpeed Autotuning with AzureML + +To try DeepSpeed autotuning with AzureML, please see the example [here](https://github.com/Azure/azureml-examples/tree/main/cli/jobs/deepspeed/deepspeed-autotuning). diff --git a/docs/_tutorials/azure.md b/docs/_tutorials/azure.md index 1016aeafd007c2257a24e54e735e6ec923869106..6c7cded7b27c0bd3ce1f863d30c85b2e0a5bbcdc 100644 --- a/docs/_tutorials/azure.md +++ b/docs/_tutorials/azure.md @@ -3,132 +3,20 @@ title: "Getting Started with DeepSpeed on Azure" tags: getting-started --- -This tutorial will help you get started running DeepSpeed on [Azure virtual -machines](https://azure.microsoft.com/en-us/services/virtual-machines/). -Looking forward, we will be integrating these techniques and additional enhancements -into the [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/) platform to -benefit all your large model training jobs. +This tutorial will help you get started with DeepSpeed on Azure. 
If you don't already have an Azure account please see more details here: [https://azure.microsoft.com/](https://azure.microsoft.com/). -To use DeepSpeed on [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/), please take a look at easy-to-use examples for Transformers and CIFAR training from [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). +# DeepSpeed on Azure via AzureML -To help with launching Azure instances we suggest using the [Azure -CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created -several helper scripts to get you quickly started using DeepSpeed with Azure. - * Install Azure CLI on your local box: [https://docs.microsoft.com/en-us/cli/azure/install-azure-cli](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli). - * Alternatively, you can use the Azure in-browser shell: [https://shell.azure.com/](https://shell.azure.com/). +The recommended and simplest method to try DeepSpeed on Azure is through [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/). A training example and a DeepSpeed autotuning example using AzureML v2 can be found [here](https://github.com/Azure/azureml-examples/tree/main/cli/jobs/deepspeed). -## Create an SSH key -Generate an SSH key that will be used across this tutorial to SSH into your VMs and -between Docker containers. `ssh-keygen` is the recommended way of doing this. Our scripts -assume your key is located inside the same directory as the Azure scripts. +For AzureML v1 examples, please take a look at easy-to-use examples for Megatron-DeepSpeed, Transformers and CIFAR training [here](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed). -## Azure Config JSON -Our helper scripts depend on the following a configuration JSON for deployment -and setup. 
We have provided a simple example JSON in `azure_config.json` that -sets up a basic environment with two VMs. This config uses the NV6_Promo -instance type which has one NVIDIA Tesla M60 GPU per VM. You can read more -details about the VM on the [Linux Virtual Machines -Pricing](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/) -page. +> Our [Megatron-DeepSpeed](https://github.com/microsoft/megatron-deepspeed) contains the most up to date [recipe](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml) for end-to-end training on AzureML. -See the example below: - ```json -{ - "num_vms": 2, - "location": "southcentralus", - "azure_sku": "Standard_NV6_Promo", - "ssh_private_key": "id_rsa", - "docker_ssh_port": 2222 -} -``` +# DeepSpeed on Azure VMs -## Dependencies -The scripts in this tutorial require [jq](https://stedolan.github.io/jq/) to help with -parsing JSON from the command line. Also it is recommended to install -[pdsh](https://linux.die.net/man/1/pdsh) to help launch ssh connections in parallel. +If you don't have access to AzureML or if want to build a custom environments using [Azure virtual machines](https://azure.microsoft.com/en-us/services/virtual-machines/) or Azure VM Scale-Sets ([VMSS](https://docs.microsoft.com/en-us/azure/virtual-machine-scale-sets/overview)), we are working on easy-to-use cluster setup scripts that will be published in the next few weeks. -## Create Azure VMs -We first need to allocate the VMs. We provide a script -```bash -./create_vms.sh -``` -to create VMs with the Azure SKU in the region specified in `azure_config.json`. Feel -free to customize your JSON to your desired region/SKU. This step will take a few minutes -to complete while it sets up all of your VMs on Azure. - -## Setup VM environment to use DeepSpeed -Next, we need to configure the VM environment for DeepSpeed. 
We provide a script -```bash -./setup_vms.sh -``` -to generate a [hostfile](/getting-started/#resource-configuration-multi-node) and SSH -configuration on all of the VMs. This configuration will be used by the DeepSpeed -Docker containers in the next step. - -## Start the DeepSpeed docker container -We now setup the DeepSpeed Docker containers on the VMs. We provide a script -```bash -./setup_docker.sh -``` -to pull the DeepSpeed image onto all VMs and start a container instance in the -background. This will take several minutes since it needs to pull the entire Docker -image. - -## Access VMs -The tool `azure_ssh.sh` will let you SSH into any of the VMs with this -syntax: -```bash -./azure_ssh.sh [command] -``` -where the `node-id` is a number between `0` and `num_vms-1`. This script will find the -public IP address of your VM and use the SSH key provided in the Azure configuration -JSON. - -## Access DeepSpeed container -Everything should be up and running at this point. Let's access the running DeepSpeed -container on the first VM and make sure we can talk to the other containers in our deployment. - - * SSH into the first VM via: `./azure_ssh.sh 0` - * Change directories into the azure folder of this repo via: `cd ~/workdir/DeepSpeed/azure` - * Attach the running docker container via: `./attach.sh` - * You should now be able to `ssh` into any other docker container, the containers can be - accessed via their SSH alias of `worker-N`, where `N` is the VM number between `0` - and `num_vms-1`. In this example we should be able to successfully run `ssh worker-1 - hostname` which will return the hostname of worker-1. - -## Parallel SSH across containers - DeepSpeed comes installed with a helper script `ds_ssh` which is a wrapper around - the [pdsh](https://linux.die.net/man/1/pdsh) command that lets you issue commands - to groups of hosts (via SSH) in parallel. This wrapper simply connects with the - hostfile that defines all the containers in your deployment. 
For example if you run - `ds_ssh hostname` you should see a list of all the hostnames in your deployment. - -## Run CIFAR-10 example model -We will now run the DeepSpeed CIFAR-10 model example to test the VM setup. From inside -the first DeepSpeed container: - - 1) Install the python dependencies necessary to run the CIFAR-10 example model. You can - do this across your cluster via: - ```bash - ds_ssh pip install -r ~/workdir/DeepSpeed/DeepSpeedExamples/cifar/requirements.txt - ``` - - 2) Now change directories to the CIFAR example: - ```bash - cd ~/workdir/DeepSpeed/DeepSpeedExamples/cifar - ``` - - 3) Finally, launch training across all VMs: - ```bash - deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json - ``` - -## Megatron-LM GPT2 -DeepSpeed includes an example model using Megatron-LM's GPT2. Please refer to the full -[Megatron tutorial](/tutorials/megatron/) for more details. - * In order to fully train GPT2 with DeepSpeed and ZeRO we recommend using 8 instances of - Azure's Standard_ND40rs_v2 SKU for a total of 64 NVIDIA V100 GPUs. With this setup and - a batch size of 1536 you should be able to complete 100k training steps (153.6 million - samples) in less than 2 weeks of training. +If you already have a cluster setup, you can use the [azure recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azure) that can easily be modified to train various model configurations. 
diff --git a/docs/_tutorials/bert-finetuning.md b/docs/_tutorials/bert-finetuning.md old mode 100644 new mode 100755 diff --git a/docs/_tutorials/bert-pretraining.md b/docs/_tutorials/bert-pretraining.md old mode 100644 new mode 100755 index e3771b7fdad23179bc4c60032791c0e8a87a258a..a0943949f9bc57d353f814e154f9ae649b2570d8 --- a/docs/_tutorials/bert-pretraining.md +++ b/docs/_tutorials/bert-pretraining.md @@ -4,6 +4,10 @@ excerpt: "" tags: training pre-training --- +**Note:** +On 08/15/2022 we have added another BERT pre-training/fine-tuning example at [github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/bert_with_pile](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/bert_with_pile), which includes a README.md that describes how to use it. Compared to the example described below, the new example in Megatron-DeepSpeed adds support for ZeRO and tensor-slicing model parallelism (thus supporting larger model scale), uses a public and richer [Pile dataset](https://github.com/EleutherAI/the-pile) (user can also use their own data), together with some changes to the model architecture and training hyperparameters as described in [this paper](https://arxiv.org/abs/1909.08053). As a result, the BERT models trained by the new example are able to provide better MNLI results than original BERT, but with a slightly different model architecture and larger computation requirements. If you want to train a larger-scale or better quality BERT-style model, we recommend following the new example in Megatron-DeepSpeed. If your goal is to strictly reproduce the original BERT model, we recommend following the example under DeepSpeedExamples/bing_bert as described below. On the other hand, the tutorial below helps explain how to integrate DeepSpeed into a pre-training codebase, regardless of which BERT example you use.
+{: .notice--info} + In this tutorial we will apply DeepSpeed to pre-train the BERT (**B**idirectional **E**ncoder **R**epresentations from **T**ransformers), which is widely used for many Natural Language Processing (NLP) tasks. The diff --git a/docs/_tutorials/cifar-10.md b/docs/_tutorials/cifar-10.md index 11a05a78a7494c989d60ebb9b134dd8d87432b4f..74ee04502f18d52f0b7786e7f6a84abe80b8d260 100644 --- a/docs/_tutorials/cifar-10.md +++ b/docs/_tutorials/cifar-10.md @@ -140,7 +140,8 @@ Here we initialize DeepSpeed with CIFAR-10 model (`net`), `args`, `parameters` a After initializing DeepSpeed, the original `device` and `optimizer` are removed: ```python - #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + #from deepspeed.accelerator import get_accelerator + #device = torch.device(get_accelerator().device_name(0) if get_accelerator().is_available() else "cpu") #net.to(device) #optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) diff --git a/docs/_tutorials/comms-logging.md b/docs/_tutorials/comms-logging.md new file mode 100644 index 0000000000000000000000000000000000000000..52d93eda05bcca368859151b7be271ddf5ff9d8f --- /dev/null +++ b/docs/_tutorials/comms-logging.md @@ -0,0 +1,116 @@ +--- +title: "Communication Logging" +excerpt: "Log all DeepSpeed communication calls" +tags: profiling performance-tuning +--- + +In this tutorial, we introduce DeepSpeed communication logging and provide examples of its usage. + + - [Overview](#overview) + - [Usage](#usage) + +## Overview + +NOTE: All logging communication calls are synchronized in order to provide accurate timing information. This may hamper performance if your model heavily uses asynchronous communication operations. + +Logging communication calls is vital to ensure networking resources are fully utilized. The DeepSpeed communication logger enables the detection and logging of all communication operations launched under `deepspeed.comm`. 
Each communication operation can be directly printed to the console immediately after completion (via the `verbose` config option), or a summary may be printed with a call to `deepspeed.comm.log_summary()` in the client code at the completion of training, an epoch, after N training iterations, etc. + +## Usage + +Communication logging in DeepSpeed is configured within the deepspeed [configuration file](/docs/config-json/#communication-logging). DeepSpeed will automatically log communication for either all operations (`prof_all`), or user-specified operations (`prof_ops`). + + - [Configuration Setup](#configuration-setup) + - [Verbose Logging](#verbose-logging) + - [Log Summaries](#log-summaries) + +### Configuration Setup + +Communication logging can be configured in the DeepSpeed [configuration file](/docs/config-json/#communication-logging). Communication logging can be enabled by adding the following field to DeepSpeed's configuration json file. Refer to [Communication Logging](/docs/config-json/#communication-logging) for details. + +```json +"comms_logger": { + "enabled": true, + "verbose": false, + "prof_all": true, + "debug": false +} +``` + +There are currently two ways to view communication log records: + +1. Print all communication operations with `verbose` config option. See [Verbose Logging](#verbose-logging) +2. (Recommended) Print log summary with `deepspeed.comm.log_summary()` function call. See [Log Summaries](#log-summaries) + +### Verbose Logging + +If the `verbose` configuration option is selected, all communication operations will be immediately printed to the console. This mode is intended for detailed debugging, and is not recommended for most users.
The following is an example snippet of `verbose` output: + +``` +[2022-06-26 01:39:55,722] [INFO] [logging.py:69:log_dist] [Rank 0] rank=0 | comm op: reduce_scatter_base | time (ms): 9.46 | msg size: 678.86 MB | algbw (Gbps): 1204.52 | busbw (Gbps): 1129.23 +[2022-06-26 01:39:56,470] [INFO] [logging.py:69:log_dist] [Rank 0] rank=0 | comm op: all_gather_base | time (ms): 0.11 | msg size: 6.0 MB | algbw (Gbps): 954.41 | busbw (Gbps): 894.76 +[2022-06-26 01:39:56,471] [INFO] [logging.py:69:log_dist] [Rank 0] rank=0 | comm op: all_gather_base | time (ms): 0.08 | msg size: 6.0 MB | algbw (Gbps): 1293.47 | busbw (Gbps): 1212.63 +``` + +For advanced users, the `debug` option will append the calling function of each communication operation to that operation's `log_name`. See [Log Summaries](#log-summaries) for an example of a `deepspeed.comm.log_summary()` call with `debug` enabled. + + +### Log Summaries + +It's recommended that users add a call to `deepspeed.comm.log_summary()` at training milestones (e.g. every epoch or N iterations). This enables high-level communication logging without having to sift through logs from `verbose`. + +The steps to add DeepSpeed communication log summaries are as follows: + +1. Modify configuration file with desired settings +2. (Optional) If your application contains `torch.distributed` calls that you wish to log, import `deepspeed.comm` package and modify `torch.distributed` calls to use `deepspeed.comm` (Note: The `deepspeed.comm` collective and pt2pt APIs exactly match `torch.distributed`) +3. Call `deepspeed.comm.log_summary` + +For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) example: + +```python +# Step 2: (Optional) Import deepspeed.comm +import deepspeed.comm as dist + +# Note that any communication operations using `import torch.distributed as dist` calls can remain unchanged, and will be automatically logged under deepspeed.comm! 
+dist.all_reduce(tensor) + +for epoch in range(2): + + running_loss = 0.0 + for i, data in enumerate(trainloader): + pre = time.time() + inputs, labels = data[0].to(model_engine.local_rank), data[1].to( + model_engine.local_rank) + if fp16: + inputs = inputs.half() + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + + model_engine.backward(loss) + model_engine.step() + post = time.time() + # Step 3: Call `deepspeed.comm.log_summary()` + dist.log_summary() +``` + +The following is a truncated example output of `deepspeed.comm.log_summary()` at the end of 10 iterations of Megatron-DeepSpeed with ZeRO-3: + +``` +Comm. Op Message Size Count Total Latency(ms) Avg Latency(ms) tput_avg (Gbps) busbw_avg (Gbps) +broadcast + 2.0 KB 146 11.12 0.08 0.43 0.41 + 98.25 MB 1 8317.12 8317.12 0.20 0.19 +reduce_scatter_base + 678.86 MB 40 602.29 9.69 1468.06 1376.31 +``` + + +And the following is a call to `deepspeed.comm.log_summary` under the same configuration with `debug` enabled: + +``` +Comm. Op Message Size Count Total Latency(ms) Avg Latency(ms) tput_avg (Gbps) busbw_avg (Gbps) +broadcast | [Caller Func: _broadcast_model] + 2.0 KB 146 9.39 0.06 0.52 0.48 + 98.25 MB 1 8540.60 8540.60 0.19 0.18 +reduce_scatter_base | [Caller Func: reduce_scatter_fn] + 678.86 MB 80 1527.17 13.94 1211.75 1136.01 +``` diff --git a/docs/_tutorials/curriculum-learning.md b/docs/_tutorials/curriculum-learning.md index 938955ab57cc8625bcf10c08b6d2c8bf6d177785..161c29cfc04c1deacb73184dd83abec2484a6fda 100644 --- a/docs/_tutorials/curriculum-learning.md +++ b/docs/_tutorials/curriculum-learning.md @@ -3,6 +3,10 @@ title: "Curriculum Learning: A Regularization Method for Efficient and Stable Bi tags: training pre-training --- +**Watch out!** +On 12/12/2022, we released DeepSpeed Data Efficiency Library which provides a more general curriculum learning support. 
This legacy curriculum learning feature below is still supported but we recommend using the Data Efficiency Library ([tutorial](/tutorials/data-efficiency/)). +{: .notice--warning} + **Note:** This tutorial was updated on 10/29/2021. Changes include: 1) A more detailed tuning strategy. 2) Pipeline parallelism support. 3) Token-based learning rate decay. 4) A new GPT-2 example at [github.com/microsoft/Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed). See details below. {: .notice--info} diff --git a/docs/_tutorials/data-efficiency.md b/docs/_tutorials/data-efficiency.md new file mode 100644 index 0000000000000000000000000000000000000000..329e3bb89e2fbdfb6c874412d632c06988429348 --- /dev/null +++ b/docs/_tutorials/data-efficiency.md @@ -0,0 +1,100 @@ +--- +title: "DeepSpeed Data Efficiency: A composable library that makes better use of data, increases training efficiency, and improves model quality" +tags: training pre-training +--- + +**What is DeepSpeed Data Efficiency:** DeepSpeed Data Efficiency is a library purposely built to make better use of data, increase training efficiency, and improve model quality. + +**Why use DeepSpeed Data Efficiency:** DeepSpeed Data Efficiency offers novel data efficiency techniques to achieve better training efficiency and/or better model quality. DeepSpeed Data Efficiency takes extensibility, flexibility, and composability into consideration, which makes it easier to customize the techniques, apply the techniques to various training tasks, and compose multiple techniques together. We highly recommend that you also read [our blog](https://www.deepspeed.ai/2022/12/11/data-efficiency.html) to learn more about (at a high level) why we built DeepSpeed Data Efficiency and what benefits it provides to users.
Additional technical details can be found in our papers, “[Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers](https://arxiv.org/abs/2211.11586)” which describes the random-LTD technique, and “[DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing](https://arxiv.org/abs/2212.03597)” which describes the curriculum learning technique and overall DeepSpeed Data Efficiency framework. + +**How to use DeepSpeed Data Efficiency:** In the following tutorial, the first two sections will describe the data efficiency techniques supported by the library. The third section will describe how to compose the two techniques to achieve even better training efficiency/model quality. + +## 1. Curriculum Learning + +### 1.1 What is Curriculum Learning +Curriculum learning (proposed by [Yoshua Bengio et al.](https://dl.acm.org/doi/abs/10.1145/1553374.1553380)) aims to improve training convergence speed by presenting relatively easier or simpler examples earlier during training. Building a curriculum learning solution usually requires two components: the difficulty metric (i.e., how to quantify the difficulty of each data sample) and the pacing function (i.e., how to decide the curriculum difficulty range when sampling next training data batch). 
+ +### 1.2 When to use Curriculum Learning +Curriculum learning has been successfully applied to various training tasks (see details in for example [this survey paper](https://arxiv.org/abs/2010.13166)), and last year we also released a specific curriculum learning technique (sequence length warmup) for GPT-style model pretraining (see technical details in our paper “[The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models](https://openreview.net/forum?id=JpZ5du_Kdh)” published in NeurIPS 2022 and the [tutorial for this legacy curriculum learning feature](/tutorials/curriculum-learning/)). This new general curriculum learning library inside DeepSpeed Data Efficiency enables users to employ curriculum learning to their models at **maximum extensibility**: users can easily analyze, index, and sample their training data based on various customizable strategies. Using this library, we were able to explore different CL strategies for GPT-3 and BERT pretraining and identify the best solution that provides up to **1.5x data saving** while still maintaining similar model quality. + +### 1.3 How to use Curriculum Learning + +#### 1.3.1 GPT-3 and BERT pretraining +The `examples/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/microsoft/Megatron-DeepSpeed) includes our examples of how to apply curriculum learning to GPT-3 and BERT pretraining. There are 3 steps: data analysis, pretraining, and eval/finetuning. + +**Data analysis:** Curriculum learning requires a data analysis before pretraining that calculate the difficulty of each data sample (based on the metric provided by user), and build an index that map difficulty value to corresponding data samples. (There are exceptions: for example the truncation-based sequence length metric can be achieved by data postprocessing without data analysis.) We provide a data analyzer to perform the offline CPU-only data analysis. 
+ +`examples/data_efficiency/gpt/ds_analyze_*.sh` and `examples/data_efficiency/bert/ds_analyze_*.sh` are example scripts for GPT-3 and BERT's data analysis. Our data analyzer employs a simple Map-Reduce scheme. First, at the Map stage the `ds_analyze_*_data_map.sh` is used to split the dataset and compute the difficulty value for each data sample. User would need to provide a function to compute the metric (we implement ours in `examples/data_efficiency/analyze_data.py`), the raw training dataset, and other configurations such as number of CPU nodes and number of threads per node. Then the data analyzer will automatically splits the dataset based on number of workers, compute the difficulty values in a batched fashion, and write the results to two indexes: one index maps each data sample to its difficulty value, and another index maps each distinct difficulty value to the corresponding samples. Second, at the Reduce stage the `ds_analyze_*_data_reduce.sh` is used to merge the index files produced by all workers. One thing to note is that in order to enable speedup by distribution yet still being able to merge all the output, the Map stage will potentially generate a lot of output files, which is proportional to number of CPU nodes, number of threads per node, and number of possible metric values. Thus to avoid generating too much output files, we recommend to start with a smaller number of nodes/threads (in the output log we provide an estimate required time for users to judge if they want to increase number of workers), and we recommend to limit number of possible difficulty values when designing your difficulty metric (our experience shows that a few thousands of distinct values is already sufficient to enjoy the benefit of curriculum learning). + +**Pretraining** `examples/data_efficiency/gpt/pretrain` and `examples/data_efficiency/bert/pretrain` include the example pretraining scripts with curriculum learning feature. 
Several changes are needed to enable curriculum learning during pretraining: (1) User need to provide a DeepSpeed json config file which includes configurations for curriculum learning (see [list of configuration](/docs/config-json/#data-efficiency) for details). We provide tested example configurations in `examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh` and `examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh`. (2) When initializing the DeepSpeed engine via `deepspeed.initialize`, user needs to provide the train dataset and use the dataloader returned by the initialization (this dataloader includes the curriculum learning capability). We provide an example implementation of this change in `megatron/training.py` function `setup_model_and_optimizer`. (3) If the curriculum learning metric requires data postprocessing (such as truncation-based sequence length), user needs to use the DeepSpeed engine's `set_data_post_process_func` API to provide the postprocessing function. We provide an example implementation of this change in `megatron/training.py`, `pretrain_bert.py`, and `pretrain_gpt.py`. (4) If the curriculum learning metric requires a custom scheduling strategy (the pacing function), user needs to use the DeepSpeed engine's `set_custom_curriculum_learning_schedule` API to provide the function to update the max accepted difficulty during training. DeepSpeed engine will provide a global train step input to this callback function. + +**Eval/finetuning** `examples/data_efficiency/gpt/eval/` and `examples/data_efficiency/bert/finetune` include the example scripts for GPT-3 model's zero-/few-shot evaluation and BERT model's finetuning. Our [paper](https://arxiv.org/abs/2212.03597) includes the reference eval/finetune results if you follow our example scripts to perform the pretraining/eval/finetuning. 
+ +#### 1.3.2 GPT-2 finetuning +The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples) includes our examples of how to apply curriculum learning to GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script. For CL metrics that require data analysis (e.g., the vocabulary rarity metric), you need to first use ```data_efficiency/gpt_finetuning/finetune/ds_analyze_gpt_data_*``` to analyze and index the dataset, similar to the GPT-3 pre-training case described above in 1.3.1. + +## 2. Random layerwise token dropping (random-LTD) + +### 2.1 What is random-LTD +Random-LTD is an efficient token drop method applied to each layer with random assignment. Precisely, for each layer, as compared to the baseline, random-LTD randomly selects a subset of the tokens and feeds them into the transformer layer. Afterward, we combine the output of transformer layer with the dropped tokens to recover the full sequence length. Thus, the next layer still receives the full sequence and can repeat this process. For more technical details please read [our random-LTD paper](https://arxiv.org/abs/2211.11586). + +### 2.2 When to use random-LTD +When you want to pretrain/fine-tune a transformer-based model, it is always a good idea to try random-LTD, as it can achieve a better performance than the standard baseline training given the same amount of computational cost. If you have limited resources, random-LTD achieves similar accuracy as the original baseline method with up to 33.3% theoretical cost saving and up to 25.6% wall-clock time saving. Particularly, if you need to train a much larger model with >=24 layers and with >=2048 sequence length, our method will be much more efficient than baseline. 
+ +### 2.3 How to use random-LTD + +#### 2.3.1 GPT-3 and BERT pretraining +The `examples/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/microsoft/Megatron-DeepSpeed) includes our examples of how to apply random-LTD to GPT-3 and BERT pretraining. + +`examples/data_efficiency/gpt/pretrain` and `examples/data_efficiency/bert/pretrain` include the example pretraining scripts with random-LTD feature. Several changes are needed to enable random-LTD during pretraining: (1) User need to provide a DeepSpeed json config file which includes configurations for random-LTD (see [list of configuration](/docs/config-json/#data-efficiency) for details). We provide tested example configurations in `examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh` and `examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh`. (2) After initializing the DeepSpeed engine via `deepspeed.initialize`, user needs to use the `convert_to_random_ltd` API to convert and wrap the model layers in order to enable the random-LTD feature. We provide an example implementation of this change in `megatron/training.py` function `setup_model_and_optimizer`. (3) In order for random-LTD to understand the input argument mapping of the forward function, user need to change all the input arguments (except the hidden_states input) into keyword/named argument. For example, in `megatron/model/transformer.py` we changed the forward function from `def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, layer_past=None, get_key_value=False):` to `def forward(self, hidden_states, attention_mask=None, encoder_output=None, enc_dec_attn_mask=None, layer_past=None, get_key_value=False):`. (4) When saving model checkpoints, (especially if the state dictionary has non-traditional structure) user needs to use the `remove_random_ltd_state_dict` API to convert the random-LTD-wrapped layers back to original model layers. 
We provide an example implementation of this change in `megatron/model/language_model.py`. + +For eval/finetuning of the pretrained model, see [previous section](#131-gpt-3-and-bert-pretraining) about how to use our example scripts. + +#### 2.3.2 GPT-2 and ViT finetuning +The `data_efficiency` directory in our [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples) includes our examples of how to apply random-LTD to GPT-2 and ViT finetuning. + +Just like pretraining case, similar changes are required to enable random-LTD for finetuning: (1) DeepSpeed json config file. (2) Use the `convert_to_random_ltd` API to convert and wrap the model layers. (3) When saving model checkpoints, use the `remove_random_ltd_state_dict` API to convert the random-LTD-wrapped layers back to original model layers. + +One can run our GPT finetuning example by: + +```shell +DeepSpeedExamples/data_efficiency/gpt_finetuning$ pip install -r requirement.txt +DeepSpeedExamples/data_efficiency/gpt_finetuning$ bash ./bash_script/run_base_random_ltd.sh +DeepSpeedExamples/data_efficiency/gpt_finetuning$ bash ./bash_script/run_medium_random_ltd.sh +``` + +And the reference final result is: + +```shell +For run_base_random_ltd.sh: +End of training epoch 3 step 1344 consumed_token 2148032 best perplexity 22.552324221233757 time 0.17486039188173083 hr + +For run_medium_random_ltd.sh: +End of training epoch 3 step 1373 consumed_token 2147024 best perplexity 17.332243199130996 time 0.4661190489927928 hr +``` + +One can run our ViT finetuning example by: + +```shell +DeepSpeedExamples/data_efficiency/vit_finetuning$ pip install -r requirement.txt +DeepSpeedExamples/data_efficiency/vit_finetuning$ bash ./bash_script/run_cifar.sh +DeepSpeedExamples/data_efficiency/vit_finetuning$ bash ./bash_script/run_imagenet.sh +``` + +And the reference final result is: + +```shell +For run_cifar.sh: +13 epoch at time 480.6546013355255s | researved_length 197 +iter 5474 | LR [0.0001]| val_acc 
97.97000122070312 | layer_token 305784192 +``` + +## 3. Composing curriculum learning and random-LTD to achieve more + +### 3.1 GPT-3 and BERT pretraining +The `examples/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/microsoft/Megatron-DeepSpeed) includes our examples of how to compose curriculum learning and random-LTD, and apply both of them to GPT-3 and BERT pretraining. + +The changes needed are the same as described in the previous two sections, since DeepSpeed Data Efficiency already handles the complexity when composing the two techniques. However, one thing to note is that since both random-LTD and some of the curriculum learning metrics will change the sequence length, it could require some extra code to calculate the effective sequence length at each step. We provide an example implementation of this change in `megatron/training.py` function `train` where we calculate the `actual_seq_length`. + +### 3.2 GPT-2 finetuning +The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples) includes our examples of how to compose curriculum learning and random-LTD for GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script. diff --git a/docs/_tutorials/flops-profiler.md b/docs/_tutorials/flops-profiler.md index b90a55efcada05fdde6773431bc61b3cd4e72d22..169bfb18d4422223ff2f46efa6be2104b5966940 --- a/docs/_tutorials/flops-profiler.md +++ b/docs/_tutorials/flops-profiler.md @@ -184,7 +184,7 @@ When using DeepSpeed for model training, the profiler can be configured in the d #### Example: Megatron-LM -For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM).
+For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/megatron/Megatron-LM). An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attention_heads = 32, batch_size = 1024, seq_length = 1024`) is shown below. diff --git a/docs/_tutorials/gan.md b/docs/_tutorials/gan.md old mode 100644 new mode 100755 diff --git a/docs/_tutorials/inference-tutorial.md b/docs/_tutorials/inference-tutorial.md index 253bb7092467242974b2b06682f18d50b86f4e5c..176662296ad9d371a08c22b347366db883125007 100644 --- a/docs/_tutorials/inference-tutorial.md +++ b/docs/_tutorials/inference-tutorial.md @@ -11,7 +11,7 @@ DeepSpeed provides a seamless inference mode for compatible transformer based mo For inference with DeepSpeed, use `init_inference` API to load the model for inference. Here, you can specify the MP degree, and if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or the checkpoint path. -To inject the high-performance kernels, you need to set the `replace_with_kernel_inject` to True and pass int the `replace_method` as `'auto'` for the compatible models, or define a new policy in [replace_policy class](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py) and pass in the `injection_policy` that specifies the different parameters of a Transformer layer, such as attention and feed-forward parts. The `injection_policy` shows the mapping between the parameters of the original layer implementation with the inference-customized Transformer layer. +To inject the high-performance kernels, you need to set the `replace_with_kernel_inject` to True for the compatible models. 
For models not supported by DeepSpeed, the users can submit a PR that defines a new policy in [replace_policy class](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py) that specifies the different parameters of a Transformer layer, such as attention and feed-forward parts. The policy classes in DeepSpeed create a mapping between the parameters of the original user-supplied layer implementation with DeepSpeed's inference-optimized Transformer layer. ```python # create the model @@ -28,7 +28,6 @@ ds_engine = deepspeed.init_inference(model, mp_size=2, dtype=torch.half, checkpoint=None if args.pre_load_checkpoint else args.checkpoint_json, - replace_method='auto', replace_with_kernel_inject=True) model = ds_engine.module output = model('Input String') @@ -111,8 +110,7 @@ generator = pipeline('text-generation', model='EleutherAI/gpt-neo-2.7B', generator.model = deepspeed.init_inference(generator.model, mp_size=world_size, dtype=torch.float, - replace_method='auto', - replace_with_kernel_inject=True) + replace_with_kernel_inject=True) string = generator("DeepSpeed is", do_sample=True, min_length=50) if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: diff --git a/docs/_tutorials/large-models-w-deepspeed.md b/docs/_tutorials/large-models-w-deepspeed.md index ea6145cb6ae53d66c0a07a6e193cbca5243cb8f6..21b9956decc29bf737414d591f0b90359a02ee5c 100644 --- a/docs/_tutorials/large-models-w-deepspeed.md +++ b/docs/_tutorials/large-models-w-deepspeed.md @@ -24,7 +24,7 @@ At a broad level, there are two primary paths to training a large model: Since, ZeRO is a replacement to data parallelism, it offers a seamless integration that does not require model code refactoring for existing data-parallel models. For majority of cases, ZeRO based technologies offers model scalability, training throughput efficiency without compromising ease of use. 
-**3D Parallelism based technologies**: 3D Parallelism refers to a combination of three different forms of parallel technologies namely tensor-slicing, pipeline-parallelism, and data parallelism (or ZeRO powered data parallelism). Combing these three forms allows for harnessing the strength of each of these technologies without the drawback of any. 3D Parallelism enables DeepSeed to achieve excellent training throughput efficiency in the scenarios where relying on ZeRO based technologies alone might be insufficient. However, 3D parallelism requires non-trivial model code refactoring, and therefore a careful consideration is important to identify cases where 3D-Parallelism can bring non-trivial throughput benefits. +**3D Parallelism based technologies**: 3D Parallelism refers to a combination of three different forms of parallel technologies namely tensor-slicing, pipeline-parallelism, and data parallelism (or ZeRO powered data parallelism). Combining these three forms allows for harnessing the strength of each of these technologies without the drawback of any. 3D Parallelism enables DeepSpeed to achieve excellent training throughput efficiency in the scenarios where relying on ZeRO based technologies alone might be insufficient. However, 3D parallelism requires non-trivial model code refactoring, and therefore a careful consideration is important to identify cases where 3D-Parallelism can bring non-trivial throughput benefits. ## Deciding which technology to use diff --git a/docs/_tutorials/megatron.md b/docs/_tutorials/megatron.md index 7d81ecdcd28d53c36c514cb186d8fd5a6a88824e..2977f577302946e799a0ecd143286d6f189d3251 --- a/docs/_tutorials/megatron.md +++ b/docs/_tutorials/megatron.md @@ -19,7 +19,7 @@ reduction_** from using DeepSpeed.
## Training GPT-2 with the Original Megatron-LM -We've copied the original model code from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) into DeepSpeed [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM-v1.1.5-ZeRO3) and made it available as a submodule. To download, execute: +We've copied the original model code from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) into DeepSpeed [Megatron-LM](https://github.com/microsoft/Megatron-DeepSpeed) and made it available as a submodule. To download, execute: ```bash git submodule update --init --recursive ``` diff --git a/docs/_tutorials/mixture-of-experts-inference.md b/docs/_tutorials/mixture-of-experts-inference.md index 42df78dd0cfcb16e521d9060474f8c7ae61cf051..7a75c84935d772659792bfbd06ba0c8f6f563363 100644 --- a/docs/_tutorials/mixture-of-experts-inference.md +++ b/docs/_tutorials/mixture-of-experts-inference.md @@ -23,7 +23,7 @@ In this part, we elaborate the usage of MoE inference support in the DeepSpeed l ### Initializing for Inference -For inference with DeepSpeed-MoE, use `init_inference` API to load the DeepSpeed MoE model for inference. Here, you can specify the model-parallelism/tensor-slicing degree (mp_size), expert parallelism degree (ep_size), and number of experts (moe_exeperts). We create various process groups based on minimum of the world\_size (total number of GPUs) and expert parallel size. By using this group, we can partition the experts among expert-parallel GPUs. If number of experts is lower than total number of GPUs, DeepSpeed-MoE leverages expert-slicing for partitioning the expert parameters between the expert-parallel GPUs. Furthermore, if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or simply pass the `'checkpoint'` path to load the model. 
To inject the high-performance inference kernels, you can pass int the `replace_method` as `'auto'` and set the `replace_with_kernel_inject` to True. +For inference with DeepSpeed-MoE, use `init_inference` API to load the DeepSpeed MoE model for inference. Here, you can specify the model-parallelism/tensor-slicing degree (mp_size), expert parallelism degree (ep_size), and number of experts (moe_experts). We create various process groups based on minimum of the world\_size (total number of GPUs) and expert parallel size. By using this group, we can partition the experts among expert-parallel GPUs. If number of experts is lower than total number of GPUs, DeepSpeed-MoE leverages expert-slicing for partitioning the expert parameters between the expert-parallel GPUs. Furthermore, if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or simply pass the `'checkpoint'` path to load the model. To inject the high-performance inference kernels, you can set `replace_with_kernel_inject` to True. ```python @@ -44,7 +44,6 @@ ds_engine = deepspeed.init_inference(moe_model, dtype=torch.half, moe_experts=args.num_experts, checkpoint=args.checkpoint_path, - replace_method='auto', replace_with_kernel_inject=True,) model = ds_engine.module output = model('Input String') @@ -55,7 +54,7 @@ output = model('Input String') Here, we show a text-generation example using an MoE model for which we can specify the model-parallel size and number of experts. DeepSpeed inference-engine takes care of creating the different parallelism groups using the tensor-slicing degree, number of experts, and the total number of GPUs used for running the MoE model. Regarding the expert parameters, we first use the expert-parallelism to assign each group of experts to one GPU. If number of GPUs is higher than number of experts, we use expert-slicing to partition each expert vertically/horizontally across the GPUs.
-Let's take a look at some of the parameters passed to run our example. Please refer to [DeepSpeed-Example](https://github.com/microsoft/Megatron-DeepSpeed/blob/moe/examples/generate_text.sh) for a complete generate-text inference example. +Let's take a look at some of the parameters passed to run our example. Please refer to [DeepSpeed-Example](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/generate_text.sh) for a complete generate-text inference example. ```bash @@ -97,7 +96,7 @@ generate_samples_gpt.py \ --num-attention-heads 16 \ --max-position-embeddings 1024 \ --tokenizer-type GPT2BPETokenizer \ - --load $checpoint_path \ + --load $checkpoint_path \ --fp16 \ --ds-inference \ ``` diff --git a/docs/_tutorials/mixture-of-experts-nlg.md b/docs/_tutorials/mixture-of-experts-nlg.md old mode 100644 new mode 100755 index e43cb83d0ed9cd2f8ebe81f07460b6a179900b6e..c88df2df75e0a209c9f438e4410f51f2d8c1878c --- a/docs/_tutorials/mixture-of-experts-nlg.md +++ b/docs/_tutorials/mixture-of-experts-nlg.md @@ -7,7 +7,7 @@ In this tutorial, we introduce how to apply DeepSpeed Mixture of Experts (MoE) t ## 1. Installation -You would need to install DeepSpeed v0.6.0 or higher to use the MoE feature. The MoE for NLG model examples are in the [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) repo (currently under [the moe branch](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe) but later could be merged to main branch). +You would need to install DeepSpeed v0.6.0 or higher to use the MoE feature. The MoE for NLG model examples are in the [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) repo under the MoE folder. ## 2. Training NLG+MoE models @@ -15,7 +15,7 @@ You would need to install DeepSpeed v0.6.0 or higher to use the MoE feature. The To apply MoE to the GPT-style model, we made several changes in Megatron framework, mostly in `megatron/model/` where we add the MoE layers into the model. ### 2.2. 
Pre-training the Standard MoE model -We provide example training scripts under [examples/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe/examples/MoE) which we used to perform the experiments in our [Blog]({{ site.press_release_v6 }}). There are a few new hyperparameters for standard MoE model: +We provide example training scripts under [examples/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/MoE) which we used to perform the experiments in our [Blog]({{ site.press_release_v6 }}). There are a few new hyperparameters for standard MoE model: `--num-experts`: the number of experts per MoE layer. In our experiments we set it to 128. Larger number of experts tend to provide better convergence, but it's a diminishing return. @@ -30,7 +30,7 @@ We provide example training scripts under [examples/MoE](https://github.com/micr ### 2.3. Pre-training the PR-MoE model -PR-MoE is a new designed MoE models, standing for Pyramid-Residual-MoE, which improves the parameter efficiency up to 3x as compared to standard MoE. Please see our [Blog]({{ site.press_release_v6 }}) for more details. We provide example training scripts under [examples/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe/examples/MoE). There are a few different hyperparameters for PR-MoE model compared to standard MoE: +PR-MoE is a new designed MoE models, standing for Pyramid-Residual-MoE, which improves the parameter efficiency up to 3x as compared to standard MoE. Please see our [Blog]({{ site.press_release_v6 }}) for more details. We provide example training scripts under [examples/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/MoE). There are a few different hyperparameters for PR-MoE model compared to standard MoE: `--num-experts`: Instead of providing a single number, to enable Pyramid-MoE, you need to provide a list, whose length is the same as the number of MoE layers. 
We suggest to use more experts in the latter stage (close to output) of the model. @@ -67,4 +67,4 @@ MoS, standing for Mixture-of-Students, is a staged distillation-based technique In addition to the new parameters above, we observe that using the teacher PR-MoE during the entire training process may adversely impact the final student model accuracy. In our experiments, we use a staged distillation method by stopping distillation early in the training process (e.g., after 400K steps) and perform optimization only against the standard language modeling loss for the rest of the training. -We provide example training scripts under [examples/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe/examples/MoE). Details of our parameter settings can be found in the example training scripts. The performance results of MoS can be seen from our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/) and our [paper](https://arxiv.org/abs/2201.05596). +We provide example training scripts under [examples/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/MoE). Details of our parameter settings can be found in the example training scripts. The performance results of MoS can be seen from our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/) and our [paper](https://arxiv.org/abs/2201.05596). 
diff --git a/docs/_tutorials/model-compression.md b/docs/_tutorials/model-compression.md new file mode 100644 index 0000000000000000000000000000000000000000..20f2e6a6b25b526ce3922164c9447f04dad416fd --- /dev/null +++ b/docs/_tutorials/model-compression.md @@ -0,0 +1,441 @@ +--- +title: "DeepSpeed Model Compression Library" +tags: model-compression +--- + +**What is DeepSpeed Compression:** DeepSpeed Compression is a library purposely built to make it easy to compress models for researchers and practitioners while delivering faster speed, smaller model size, and significantly reduced compression cost. + +**Why use DeepSpeed Compression:** DeepSpeed Compression offers novel state-of-the-art compression techniques to achieve faster model compression with better model quality and lower compression cost. DeepSpeed Compression also takes an end-to-end approach to improve the computation efficiency of compressed models via a highly optimized inference engine. Furthermore, our library has multiple built-in state-of-the-art compression methods. It supports the synergistic composition of these methods and the system optimizations, offering the best of both worlds while allowing a seamless and easy-to-use pipeline for efficient DL model inference. We highly recommend you also to read [our blog](https://www.microsoft.com/en-us/research/blog/deepspeed-compression-a-composable-library-for-extreme-compression-and-zero-cost-quantization/) to learn more about (at a high level) why we build DeepSpeed Compression and what benefits it provides to users. + +**How to use DeepSpeed Compression:** The first section General Tutorial will describe the compression methods supported by the library. 
The following sections will describe our research work on how to compose different compression methods to perform [zero-cost quantization (ZeroQuant)](#2-tutorial-for-zeroquant-efficient-and-affordable-post-training-quantization) and [extreme compression (XTC)](#3-tutorial-for-xtc-simple-yet-effective-compression-pipeline-for-extreme-compression). Unless otherwise stated, experiment results listed below are based on NVIDIA A100 GPU, and we observe slightly different result numbers when using different GPU hardwares. + +## 1. General Tutorial +To use DeepSpeed Compression library, you need to install DeepSpeed >= 0.7.0 following the [installation guide](/tutorials/advanced-install/). Currently the DeepSpeed Compression includes seven compression methods: layer reduction via knowledge distillation, weight quantization, activation quantization, sparse pruning, row pruning, head pruning, and channel pruning. In the following subsections, we will describe what these methods are, when to use them, and how to use them via our library. + +### 1.1 Layer Reduction +**What is layer reduction** + +Neural networks are constructed from input layer, output layer and hidden layer. For example, the BERT-base language model consists of embedding layer (input layer), classification layer (output layer) and 12 hidden layers. Layer reduction means reducing the number of hidden layers while keeping the width of the network intact (i.e., it does not reduce the dimension of the hidden layer). This method can linearly reduce the inference latency of hidden layers regardless of the hardware and/or scenarios. + +**When to use layer reduction** + +If the model is very deep, you may consider using this method. It works much better when applying knowledge distillation. Layer reduction can be applied in both the pre-training and fine-tuning stages. The former generates a distilled task-agnostic model, while the latter generates a task-specific distilled model. 
In our XTC work ([paper](https://arxiv.org/abs/2206.01859), [tutorial](#3-tutorial-for-xtc-simple-yet-effective-compression-pipeline-for-extreme-compression)), we also discuss when to apply layer reduction. + +**How to use layer reduction** + +Layer reduction can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#layer-reduction)). Users have the freedom to select any depth by `keep_number_layer` and any subset of the network layers by `teacher_layer`. In addition, users also can choose whether to reinitialize the input/output layers from the given model (teacher model) by `other_module_name`. + +To apply layer reduction for task-specific compression, we provide an example on how to do so for BERT fine-tuning. Layer reduction is about resetting the depth of network architecture and reinitialization of weight parameters, which happens before the training process. The example includes the following changes to the client code (`model_compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)): + +(1) When initializing the model, the number of layers in the model config should be the same as `keep_number_layer` in DeepSpeed config JSON file. For Hugging Face BERT example, set `config.num_hidden_layers = ds_config["compression_training"]["layer_reduction"]["keep_number_layer"]`. + +(2) Then we need to re-initialize the model based on the DeepSpeed JSON configurations using the function `init_compression` imported from `deepspeed.compression.compress`. + +(3) During training, if KD is not used, nothing needs to be done. Otherwise, one needs to consider applying KD with the `teacher_layer` JSON configuration when calculating the difference between teacher’s and student’s output.
+ +One can run our layer reduction example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: + +```shell +DeepSpeedExamples/model_compression/bert$ pip install -r requirements.txt +DeepSpeedExamples/model_compression/bert$ bash bash_script/layer_reduction.sh +``` + +And the final result is: + +```shell +Epoch: 18 | Time: 12m 38s +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8340295466123281/0.8339096826688365 +``` + +To apply layer reduction for task-agnostic compression, we provide an example on how to do so in the GPT pre-training stage. + +Step 1: Obtain the latest version of the [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed). + +Step 2: Enter `Megatron-DeepSpeed/examples/compression` directory. + +Step 3: Run the example bash script such as `ds_pretrain_gpt_125M_dense_cl_kd.sh`. The args related to the pre-training distillation are: + +(1)`--kd`, this enables knowledge distillation. + +(2)`--kd-beta-ce`, this specifies the knowledge distillation coefficient. You can often leave it set to the default value 1, but sometimes tuning this hyperparameter leads to better distillation results. + +(3)`--num-layers-teacher`, `--hidden-size-teacher`, `--num-attention-heads-teacher`, these parameters specify the network configuration of the teacher model. Please make sure they match the teacher model dimensions in the checkpoint. + +(4)`--load-teacher`, this is where one specifies the teacher model checkpoint. + +(5)`--load`, this is where the initial checkpoint for the student model that is going to be loaded. By default, it will load the bottom layers of the teacher models for initialization, but you can pass your own checkpoints for initialization. + +Apart from the above configs, you may also need to modify the data path in the `data_options` so that the trainer knows the data location.
To make things slightly easier, we provide several example scripts for running distillation for different model sizes, including 350M (`ds_pretrain_gpt_350M_dense_kd.sh`) and 1.3B models (`ds_pretrain_gpt_1.3B_dense_cl_kd.sh`). We also empirically found that a staged KD often led to a better pre-trained distilled model on downstream tasks. Therefore, we suggest an easy approach to early-stop KD by not setting `--kd` in the script provided (e.g., disabling KD in the remaining 40% of training). + +Step 4: After distilling the model, one can also choose to further quantize the distilled model by running the script `125M-L10-Int8-test-64gpu-distilled-group48.sh`, which quantizes both the weights and activations of a distilled model with INT8 quantizer (the weight and activation quantization are introduced in the following sections). note that you need to set the `-reset-iteration` flag when performing the quantization. We provide the zero-shot perplexity result from WikiText-2 and LAMBADA in the following table. + +| **GPT (125M)** | **#Layers** | **wikitex2 perplexity** | **LAMBADA** | +| ---------- |---------- |---------- |---------- | +| Uncompressed | 12 | 29.6 | 39.5 | +| Quantization only | 12 | 29.8 | 39.7 | +| Distillation only | 10 | 31.9 | 39.2 | +| Distillation + quantization | 10 | 32.28 | 38.7 | + +### 1.2 Weight Quantization +**What is weight quantization** + +Weight quantization maps the full precision weight (FP32/FP16) to the low bit ones, like INT8 and INT4. Quoted from [this Coursera lecture](https://www.coursera.org/lecture/machine-learning-modeling-pipelines-in-production/benefits-and-process-of-quantization-WAjyJ): “Quantization involves transforming a model into an equivalent representation that uses parameters and computations at a lower precision. This improves the model's execution performance and efficiency, but it can often result in lower model accuracy”. 
+ +**When to use weight quantization** + +From one-side, again quoted from [this Coursera lecture](https://www.coursera.org/lecture/machine-learning-modeling-pipelines-in-production/benefits-and-process-of-quantization-WAjyJ): “Mobile and embedded devices have limited computational resources, so it's important to keep your application resource efficient. Depending on the task, you will need to make a trade-off between model accuracy and model complexity. If your task requires high accuracy, then you may need a large and complex model. For tasks that require less precision, it's better to use a smaller, less complex model.”. On the other hand, recent server accelerators, like GPU, support low-precision arithmetic. Therefore, combining weight quantization with activation quantization (introduced in later section) can offer better efficiency as well. + +**How to use weight quantization** + +Weight quantization can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#weight-quantization)). The key configurations we would like to point out are: + +(1)`quantize_groups`, a group-wise weight matrix quantization: a weight matrix W is partitioned into multiple groups, and each group is quantized separately. See more details in [this paper](https://ojs.aaai.org/index.php/AAAI/article/view/6409). + +(2)`quantize_weight_in_forward` must be set to true for FP32 optimizer training and false for FP16. + +(3)`wq1`/`wq2`, users can expand more groups such as `wq3`, `wq4`, etc. + +(4)`start_bit` and `target_bit`, to simplify the first experiment we suggest to set them the same such that we apply quantization to the target bit once the iteration reaches `schedule_offset`. 
+ +There are two changes to the client code (`model_compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)): + +(1) After initialization of the model, apply `init_compression` function to the model with DeepSpeed JSON configurations. + +(2) After training, apply `redundancy_clean` function to save the quantized weight. + +One can run our weight quantization example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: + +```shell +DeepSpeedExamples/model_compression/bert$ pip install -r requirements.txt +DeepSpeedExamples/model_compression/bert$ bash bash_script/quant_weight.sh +``` + +And the final result is: + +```shell +Epoch: 09 | Time: 27m 10s +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8414671421293938/0.8422497965825875 +``` + +### 1.3 Activation Quantization +**What is activation quantization** + +Activation means the input to each layer. Activation quantization maps the input from full/half precision to low precision. See more in [this blog](https://medium.com/@joel_34050/quantization-in-deep-learning-478417eab72b). + +**When to use activation quantization** + +It can improve computation efficiency similar to [weight quantization](#12-weight-quantization). + +**How to use activation quantization** + +Activation quantization can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#activation-quantization)). Some of the components are same as weight quantization, such as `schedule_offset` and `quantization_type`. The key configurations we would like to point out are: + +(1)`range_calibration`, user has option to set dynamic or static. When using “dynamic”, the activation quantization groups will be automatically set to be token-wise (for Transformer-based models) and image-wise (for CNN-based models). 
See more in [our ZeroQuant paper](https://arxiv.org/abs/2206.01861) and the code (`deepspeed/compression/basic_layer.py` in [DeepSpeed](https://github.com/microsoft/DeepSpeed)). + +(2)`aq1`/`aq2`, users can expand more groups such as `aq3`, `aq4`, etc. + +The client code change is the same as [weight quantization](#12-weight-quantization). + +One can run our activation quantization example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: + +```shell +DeepSpeedExamples/model_compression/bert$ pip install -r requirements.txt +DeepSpeedExamples/model_compression/bert$ bash bash_script/quant_activation.sh +``` + +And the final result is: + +```shell +Epoch: 02 | Time: 28m 50s +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8375955170657158/0.8422497965825875 +``` + +### 1.4 Pruning +**What is pruning** + +Pruning aims to reduce the number of parameters and operations involved in generating a prediction by removing network connections. With pruning, you can lower the overall parameter count in the network (see more in [this Coursera lecture](https://www.coursera.org/lecture/machine-learning-modeling-pipelines-in-production/pruning-uNSOG)). We can divide the pruning strategy into two types: structured and unstructured pruning (see more in [this paper](https://arxiv.org/abs/1506.02626)). + + +| **Method** | **Type** | +| --------------------- | ------------ | +| [Sparse pruning](#141-sparse-pruning) | Unstructured | +| [Row pruning](#142-row-pruning) | Structured | +| [Head pruning](#143-head-pruning) | Structured | +| [Channel pruning](#144-channel-pruning) | Structured | + +#### 1.4.1 Sparse Pruning +**What is sparse pruning** + +Sparse pruning means we set some of the elements in each weight matrix with zero values. There is no structure pattern in the zero values. One way to perform pruning is based on the absolute value of the weight parameters, see for instance [this paper](https://arxiv.org/abs/1506.02626). 
+ +**When to use sparse pruning** + +If your model is significantly over-parameterized, you may consider using sparse pruning. However, to see the real benefit of hardware computation efficiency, the density ratio (percentage of weights to keep after pruning) must be considerably low. + +**How to use sparse pruning** + +Sparse pruning can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#sparse-pruning)). The key configurations we would like to point out are: + +(1)`schedule_offset`, we empirically find that when using `method: topk`, it’s better to set the `schedule_offset` to a large value such as 10% of the total training steps. + +(2)`method`, we support L1 norm and topk methods. Users are welcome to contribute more methods. + +(3)`sp1`, users can expand more groups such as `sp2`, `sp3`, etc. + +(4)`dense_ratio`, for unstructured sparse pruning, the dense ratio could be less than 0.1 for BRET-base model while still yielding a good accuracy. For ResNet-50, the dense ratio could be as low as 0.3 while still having good accuracy on ImageNet. + +The client code change is the same as [weight quantization](#12-weight-quantization). + +One can run our sparse pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: + +```shell +DeepSpeedExamples/model_compression/bert$ pip install -r requirements.txt +DeepSpeedExamples/model_compression/bert$ bash bash_script/pruning_sparse.sh +``` + +And the final result is: + +```shell +Epoch: 02 | Time: 26m 14s +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8416709118695873/0.8447925142392189 +``` + +#### 1.4.2 Row Pruning +**What is row pruning** + +Row pruning sets all the elements in certain rows of the weight matrix with zero values. If a row is pruned, all elements in that row are set to zero. 
+ +**When to use row pruning** + +Row pruning can be beneficial to hardware speedup, much better than sparse pruning (but may result in larger accuracy loss compared to sparse pruning). It is a feature designed for two back-to-back linear layers (e.g., Feed Forward Network in Transformers). As such, we suggested using row pruning for the first linear layer (i.e., the `intermediate.dense` layer for BERT). Reducing the row dimension of this matrix can help to reduce the column of the follow-up matrix (i.e., `layer.\\w+.output.dense` layer for BERT). Row pruning would also work for other kinds of linear layers. + +**How to use row pruning** + +Row pruning can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#row-pruning)). The key configurations we would like to point out are: + +(1)`method`, only `topk` method is supported currently. Users are welcome to contribute more methods. + +(2)`rp1`, users can expand more groups such as `rp2`, `rp3`, etc. + +(3)`related_modules`, as mentioned in “when to use row pruning”, if we do row pruning, the follow-up matrix will be affected. Thus, one needs to know the connection between the modules. + +The client code change is the same as [weight quantization](#12-weight-quantization). 
+ +One can run our row pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: + +```shell +DeepSpeedExamples/model_compression/bert$ pip install -r requirements.txt +DeepSpeedExamples/model_compression/bert$ bash bash_script/pruning_row.sh +``` + +And the final result is: + +```shell +Epoch: 02 | Time: 27m 43s +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8440142638818136/0.8425549227013832 +``` + +#### 1.4.3 Head Pruning +**What is head pruning** + +Head pruning is designed specifically for networks with multi-head attention, such as transformer-based models (see more in [this blog](https://towardsdatascience.com/transformers-explained-visually-part-3-multi-head-attention-deep-dive-1c1ff1024853)). For example, the BERT-base (BERT-large) model has 12 heads (24 heads). + +**When to use head pruning** + +Head pruning is beneficial to hardware speedup. Moreover, as stated in [this blog](https://towardsdatascience.com/head-pruning-in-transformer-models-ec222ca9ece7): “Surprising observations are made in the [paper](https://arxiv.org/abs/1905.09418), that even after training models normally (with all heads), many heads can be removed at a test time and it will not significantly affect the BLEU score, in fact, some cases removing few heads led to improving BLEU scores.”. + +NOTE: Head pruning is a feature designed for the attention layers (e.g., Multi Head Attention in Transformers). For now, it can only be applied to output matrix of the Transformer (i.e., `attention.output.dense` in BERT). Pruning the output matrix can lead to the pruning of Query/Key/Value matrix as well. + +**How to use head pruning** + +Head pruning can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#head-pruning)). The key configurations we would like to point out are: + +(1)`num_heads`: users need to provide the correct number of heads for their models. 
+ +(2)`modules`: the module `attention.output.dense` is made specific for Hugging Face BERT model. Currently, we only support this case when Query/Key/Values are separated matrices and followed by `attention.output.dense`. We are happy to assist and welcome contributions on variants of attention models. + +(3)`related_modules`: as mentioned in “when to use head pruning”, pruning the attention output matrix can lead to pruning QKV matrices as well. Thus, the input here is [“self.query”, “self.key”, “self.value”]. + +The client code change is the same as [weight quantization](#12-weight-quantization). + +One can run our head pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: + +```shell +DeepSpeedExamples/model_compression/bert$ pip install -r requirements.txt +DeepSpeedExamples/model_compression/bert$ bash bash_script/pruning_head.sh +``` + +And the final result is: + +```shell +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8397350993377484/0.8377746135069162 +``` + +#### 1.4.4 Channel Pruning +**What is channel pruning** + +Channel pruning is made specifically for convolutional layers and computer vision. According to wikipedia.org, “The color data of an image is stored in three arrays of values, known as channels.”. For example, an image with three channels passing through ResNet-18 produces 64 channels after the first layer. + +**When to use channel pruning** + +Channel pruning is a feature designed for two back-to-back CONV2d layers (e.g., residual connection in ResNet). As such, we suggest using channel pruning for the first CONV2d layer. Reducing the number of output channels of this layer can help reduce the number of input channels of the next layer. Channel pruning would also work for other kinds of CONV2d layers. 
+ +**How to use channel pruning** + +Channel pruning can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#channel-pruning)). + +One can run our channel pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: + +```shell +pip install torch torchvision +DeepSpeedExamples/model_compression/cifar$ bash run_compress.sh +``` + +And the final result is: + +```shell +after_clean +epoch 10 testing_correct: 0.7664 +``` + +Note that the above result is when not using batch-norm (BN) in the “ResNet” model. If you use BN for the model and apply channel pruning, the validation after cleaning the model will be different from the model before cleaning. We suggest users to further finetune the model after applying `redundancy_clean` for such cases. + +## 2. Tutorial for ZeroQuant: efficient and affordable post-training quantization +In this section, we introduce how to apply DS-Compression to perform cost-free INT8 quantization and lightweight INT4/INT8 mixed-precision quantization. For more details, please refer to [our paper](https://arxiv.org/abs/2206.01861). + +**What is ZeroQuant** + +ZeroQuant is an efficient Post Training Quantization method that includes (1) a fine-grained hardware-friendly quantization scheme for both weight and activations, which can significantly reduce the quantization error; (2) a novel affordable layer-by-layer knowledge distillation algorithm (LKD) even without the access to the original training data; (3) a highly-optimized quantization system backend support to remove the quantization/dequantization overhead. By these techniques, ZeroQuant is able to (1) quantize models to INT8 without any cost and (2) quantize models to INT4/INT8 mixed-precision quantization with minimal resource requirements (e.g., 31s for BERT-base quantization). 
+ +**When to use ZeroQuant** + +When you want to quantize the transformer-based model to INT8 or INT4/INT8 format, it is always a good idea to try ZeroQuant first, especially when the model is very resource-hungry (GPU and/or time) to do quantization-aware training and/or when the original training data is not accessible. + +**How to use ZeroQuant** + +One can run our BERT example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: + +```shell +DeepSpeedExamples/model_compression/bert$ pip install -r requirements.txt +DeepSpeedExamples/model_compression/bert$ bash bash_script/ZeroQuant/zero_quant.sh +``` + +And the final result is: + +```shell +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8427916454406521/0.8453010577705452 +``` + +One can run our GPT example by: + +```shell +DeepSpeedExamples/model_compression/gpt2$ pip install -r requirements.txt +DeepSpeedExamples/model_compression/gpt2$ bash bash_script/run_zero_quant.sh +``` + +And the final result is: + +```shell +Before converting the module COVN1D to linear and init_compression: 19.371443732303174 +Before cleaning, Epoch at 0 with Perplexity: 19.47031304212775 +After cleaning with Perplexity: 19.47031304212775 +``` + +NOTE: right now, we only support zero cost quantization. Stay tuned for the code release on layer-by-layer knowledge distillation proposed in the ZeroQuant paper. + +## 3. Tutorial for XTC: simple yet effective compression pipeline for extreme compression +In this section, we introduce how to apply the DeepSpeed Compression library to perform the light-weight layer reduction and ultra-low bit precision (binary/ternary) quantization. In particular, we will guide you on implementing the [XTC methods](https://arxiv.org/abs/2206.01859), namely: + +(1) Obtaining a 1-bit or 2-bit BERT-base (12-layer) with 8-bit activation quantization. + +(2) Reducing the 12-layer BERT-base to a 5-layer one and then obtaining its 1-bit or 2-bit counterparts. 
+ +**What is XTC** + +XTC (short for eXTreme Compression) is our new simple yet efficient method that compresses a model to its limit with lightweight layer reduction and robust binarization. XTC reduces the model size by 32x with almost no loss in the average score on the GLUE tasks via simple yet effective binarization technique. By combining extreme quantization and lightweight layer reduction, we can further improve the binarized model, achieving 50x model size reduction while keeping 97% of the accuracy. +For more details, see how we derive our method in [our paper](https://arxiv.org/abs/2206.01859) where we perform a systematic study on the impacts of various techniques currently used for extreme compression. + +**When to use XTC** + +If you want to significantly compress your models while retaining competitive performance, XTC could be a desirable choice. It is a simple and hyper-parameter tuning friendly method. + +**How to use XTC** + +**Installation:** Examples of XTC extreme compression for BERT models are at `model_compression/bert/bash_script/XTC` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples). You will need to install the requirements by: + +```shell +DeepSpeedExamples/model_compression/bert$ pip install -r requirements.txt +``` + +**Implementation of XTC methods:** +To accommodate users who do not have a fine-tuned model or task-specific model for compression, with the arg `--model_name_or_path yoshitomo-matsubara/bert-base-uncased-${TASK_NAME}` our python script `run_glue_no_trainer.py` automatically downloads the models from Hugging Face. Users can also use their own models with better accuracy as the teacher and the student model initialization. + +### 3.1 One-bit or Two-bit BERT-base (12-layer) with 8-bit activation quantization +For the configurations, see `model_compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples). 
In our paper, we used FP32 (`"fp16": {"enabled": false}`) to perform training, while directly applying 8-bit quantization (`"bits": 8`) to the activations and 1-bit quantization (`"start_bits": 1, "target_bits": 1`) to the attention (query, key, val) and feedforward weight matrices (`"modules": ["attention.self", "intermediate", "output.dense"]`) at the beginning of the training (`"schedule_offset": 0`). In addition, we also apply 1-bit quantization to `word_embeddings` as weight quantization. + +One can run this example by: + +```shell +DeepSpeedExamples/model_compression/bert$ bash bash_script/XTC/quant_1bit.sh +``` + +And the final result is: + +```shell +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8293428425878757/0.8396053702196908 +``` + +The other important feature we would like to mention is the `quantize_groups` inside `weight_quantization`, which is set to be 1 here to match our XTC paper's FP32 training setup. We find that under FP16 training, smaller number of quantization group (e.g., 1 or 2) could lead to unstable training. Thus, we recommend using larger number of groups (e.g., 64) under FP16. `model_compression/bert/config/ds_config_W1A8_Qgroup64_fp16.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) is the FP16 example configurations, where `"fp16": {"enabled": true}` and `"weight_quantization": {"shared_parameters": {"quantize_weight_in_forward": false}}` are different from FP32 case. + +With this config, we quantize the existing fined-tuned models downloaded from Hugging Face. For 2-bit weight quantization, user needs to update the ds_config JSON file. To give a sense of the compression performance of downloaded models compared to our paper, we collect the results (1/2-bit BERT on MNLI and QQP with 18 training epochs) in table below. The difference between this tutorial and paper is because they use different checkpoints. 
Data augmentation introduced in [TinyBERT](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT) will help significantly for smaller tasks (such as mrpc, rte, sts-b and cola). See more details in [our paper](https://arxiv.org/abs/2206.01859). + +![XTC quantization results](/assets/images/xtc-1.png){: .align-center} + +### 3.2 Compressing the 12-layer BERT-base to 1-bit or 2-bit 6/5-layer BERT + +This section consists of two parts: (a) we first perform a light-weight layer reduction, and (b) based on the model in (a), we perform 1-bit or 2-bit quantization. + +**3.2.1 Light-weight Layer Reduction** + +`model_compression/bert/config/XTC/ds_config_layer_reduction_fp16.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) is the example configuration for reducing the 12-layer BERT-base to a 6-layer one. The student’s layers are initialized from the i-th layers of the teacher with i = [1, 3, 5, 7, 9, 11] (note that the layer starts from 0), which is called `Skip-BERT_5` in our XTC paper. In addition, the student’s modules including embedding, pooler and classifier are also initialized from the teacher. For 5-layer layer reduction, one needs to change the configs in `ds_config_layer_reduction_fp16.json` to `"keep_number_layer": 5`, `"teacher_layer": [2, 4 ,6, 8, 10]` (like in `model_compression/bert/config/ds_config_TEMPLATE.json`). + +One can run this example by: + +```shell +DeepSpeedExamples/model_compression/bert$ bash bash_script/XTC/layer_reduction.sh +``` + +And the final result is: + +```shell +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8377992868059093/0.8365541090317331 +``` + +Notably, when using one-stage knowledge distillation (`--distill_method one_stage`), the difference between the outputs of teacher and student models (att_loss and rep_loss) also needs to be consistent with the initialization. See the function `_kd_function` under `forward_loss` in `model_compression/bert/util.py`. 
+ +For mnli/qqp, we set `--num_train_epochs 36`, `--learning_rate 5e-5`, and with the JSON config above. The results are given below (we also include the fp16 training results). Using fp32 clearly results in more stable performance than fp16, although fp16 can speed up the training time. + +![XTC layer reduction results](/assets/images/xtc-2.png){: .align-center} + +**3.2.2 One-bit or Two-bit quantization for 6-layer (5-layer) BERT** + +Given the above layer-reduced models ready, we now continue to compress the model with 1/2-bit quantization. `model_compression/bert/config/XTC/ds_config_layer_reduction_W1Q8_fp32.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) is the example configuration where we set the layer reduction to be true on top of `model_compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json`. In addition to the configuration, we need to update the path for the student model using `--pretrained_dir_student` in the script `model_compression/bert/bash_script/XTC/layer_reduction_1bit.sh`. User can train with a different teacher model by adding `--pretrained_dir_teacher`. + +One can run this example by: + +```shell +DeepSpeedExamples/model_compression/bert$ bash bash_script/XTC/layer_reduction_1bit.sh +``` + +And the final result is: + +```shell +Epoch: 18 | Time: 18m 11s +Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8140601120733572/0.8199755899104963 +``` + +With the command above, one can now obtain the results of 1-bit 6-layer model. Now we list more results for 2-/1-bit 6/5-layer models in the following table. Note that the checkpoints we used for the compression below are from the above table in section 3.2.1. 
+ +![XTC 6-layer and quantization](/assets/images/xtc-3.png){: .align-center} + +![XTC 5-layer and quantization](/assets/images/xtc-4.png){: .align-center} diff --git a/docs/_tutorials/monitor.md b/docs/_tutorials/monitor.md new file mode 100644 index 0000000000000000000000000000000000000000..a9c111f8eeeceb27481964c2639420deb7244b09 --- /dev/null +++ b/docs/_tutorials/monitor.md @@ -0,0 +1,105 @@ +--- +title: "Monitor" +excerpt: "Monitor your model's training metrics live and log for future analysis" +tags: profiling performance-tuning +--- + +In this tutorial, we introduce the DeepSpeed Monitor and provide examples of its usage. + + - [Overview](#overview) + - [Usage](#usage) + +## Overview + +Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), and simple CSV files. + +Below is a live monitoring view for TensorBoard: + +![TensorBoard Example Output](/assets/images/tensorboard_monitor.PNG){: .align-center} + +Below is a live monitoring view for WandB: + +![WandB Example Output](/assets/images/wandb_monitor.PNG){: .align-center} + +## Usage + +The DeepSpeed Monitor is configured within the deepspeed [configuration file](/docs/config-json/#monitoring-module-tensorboard-wandb-csv). DeepSpeed will automatically monitor key training metrics, including those tracked with the `wall_clock_breakdown` configuration option. In addition, users can log their own custom events and metrics. + + - [Automatic Monitoring](#automatic-monitoring) + - [Custom Monitoring](#custom-monitoring) + +### Automatic Monitoring + +When using DeepSpeed for model training, the Monitor can be configured in the DeepSpeed [configuration file](/docs/config-json/#monitoring-module-tensorboard-wandb-csv). 
No explicit API calls are needed to use the Monitor. The Monitor can be enabled by adding the following field to DeepSpeed's configuration json file. Refer to [Monitoring](/docs/config-json/#monitoring-module-tensorboard-wandb-csv) for details. + +```json +{ + "tensorboard": { + "enabled": true, + "output_path": "output/ds_logs/", + "job_name": "train_bert" + }, + "wandb": { + "enabled": true, + "team": "my_team", + "group": "my_group", + "project": "my_project" + }, + "csv_monitor": { + "enabled": true, + "output_path": "output/ds_logs/", + "job_name": "train_bert" + } +} +``` + +DeepSpeed will automatically log to all available and enabled monitoring backends listed in the config, and will generate live monitoring views such as those listed above. + +### Custom Monitoring + +In addition to automatic monitoring, users can log their own custom metrics in client scripts. Currently, there are two ways to initialize Monitor objects: + +1. (Recommended) - Create a `MonitorMaster(ds_config.monitor_config)` object, which automatically initializes all monitor backends present in the DeepSpeed configuration +2. Create a specific `TensorBoardMonitor(ds_config.monitor_config)`, `WandbMonitor(ds_config.monitor_config)`, `csvMonitor(ds_config.monitor_config)` object which will only initialize a specific monitor backend present in the DeepSpeed configuration + + +The steps to create a custom monitor are as follows: + +1. Add an import for your desired Monitor +2. Initialize monitor with DeepSpeed config's `monitor_config` +3. Create a list of one or more 3-tuples in the format `[("label", value, ds_engine.global_samples), ...]`\* +4. Call `monitor.write_events` on the list from step 3 + +\* Note - Some Monitor backends don't support mixed sample values. 
Be sure to use your DeepSpeed engine object's `global_samples` attribute in each 3-tuple + +For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) example: + +```python +# Step 1: Import monitor (and DeepSpeed config, if needed) +from deepspeed.monitor.monitor import MonitorMaster +from deepspeed.runtime.config import DeepSpeedConfig + +# Step 2: Initialized monitor with DeepSpeed config (get DeepSpeed config object, if needed) +ds_config = DeepSpeedConfig("ds_config.json") +monitor = MonitorMaster(ds_config.monitor_config) + +for epoch in range(2): + + running_loss = 0.0 + for i, data in enumerate(trainloader): + pre = time.time() + inputs, labels = data[0].to(model_engine.local_rank), data[1].to( + model_engine.local_rank) + if fp16: + inputs = inputs.half() + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + + model_engine.backward(loss) + model_engine.step() + post = time.time() + # Step 3: Create list of 3-tuple records (single entry in this case) + events = [("Time per step", post-pre, model_engine.global_samples)] + # Step 4: Call monitor.write_events on the list from step 3 + monitor.write_events(events) +``` diff --git a/docs/_tutorials/progressive_layer_dropping.md b/docs/_tutorials/progressive_layer_dropping.md old mode 100644 new mode 100755 diff --git a/docs/_tutorials/transformer_kernel.md b/docs/_tutorials/transformer_kernel.md old mode 100644 new mode 100755 diff --git a/docs/_tutorials/zero-offload.md b/docs/_tutorials/zero-offload.md index 420760f73391f2d197c5e271383dc7f665411fd9..764e80b7dd4fcaa66f196448c9b629dbde93d9b5 100644 --- a/docs/_tutorials/zero-offload.md +++ b/docs/_tutorials/zero-offload.md @@ -38,20 +38,22 @@ Second, we need to apply the following changes to ensure that only one GPU is us ``` ### DeepSpeed Configuration Changes -ZeRO-Offload leverages many ZeRO stage 2 mechanisms, and so the configuration changes to enable 
ZeRO-Offload are an extension of those required to enable ZeRO stage 2. The `zero_optimization` configuration to enable ZeRO-Offload is shown below: +ZeRO-Offload leverages many ZeRO stage 1 and 2 mechanisms, and so the configuration changes to enable ZeRO-Offload are an extension of those required to enable ZeRO stage 1 or 2. The `zero_optimization` configuration to enable ZeRO-Offload is shown below: ```json { "zero_optimization": { "stage": 2, - "cpu_offload": true, + "offload_optimizer": { + "device": "cpu" + }, "contiguous_gradients": true, "overlap_comm": true } } ``` -As seen above, in addition to setting the _stage_ field to **2** (to enable ZeRO stage 2), we also need to set _cpu_offload_ flag to **true** to enable ZeRO-Offload optimizations. In addition, we can set other ZeRO stage 2 optimization flags, such as _overlap_comm_ to tune ZeRO-Offload performance. With these changes we can now run the model. We share some screenshots of the training below. +As seen above, in addition to setting the _stage_ field to **2** (to enable ZeRO stage 2, but stage 1 also works), we also need to set the _offload\_optimizer_ device to **cpu** to enable ZeRO-Offload optimizations. In addition, we can set other ZeRO stage 2 optimization flags, such as _overlap\_comm_ to tune ZeRO-Offload performance. With these changes we can now run the model. We share some screenshots of the training below. 
Here is a screenshot of the training log: diff --git a/docs/assets/images/175b-trend.png b/docs/assets/images/175b-trend.png new file mode 100755 index 0000000000000000000000000000000000000000..6a500d53fb61d0e2b384e14294bbd2575aace848 Binary files /dev/null and b/docs/assets/images/175b-trend.png differ diff --git a/docs/assets/images/1t-trend.png b/docs/assets/images/1t-trend.png new file mode 100755 index 0000000000000000000000000000000000000000..7164eb0819ad2dfec2090af00ec1f0a6def9bb78 Binary files /dev/null and b/docs/assets/images/1t-trend.png differ diff --git a/docs/assets/images/3pillars.png b/docs/assets/images/3pillars.png new file mode 100755 index 0000000000000000000000000000000000000000..c2943ca912a18d202cb0d7f876349ecd7d82a872 Binary files /dev/null and b/docs/assets/images/3pillars.png differ diff --git a/docs/assets/images/530b-trend.png b/docs/assets/images/530b-trend.png new file mode 100755 index 0000000000000000000000000000000000000000..dc29b8aad02d94ad43f5b830e7afc1bd331c2cca Binary files /dev/null and b/docs/assets/images/530b-trend.png differ diff --git a/docs/assets/images/DeepSpeed_dark_transparent.svg b/docs/assets/images/DeepSpeed_dark_transparent.svg old mode 100644 new mode 100755 diff --git a/docs/assets/images/DeepSpeed_light_transparent.svg b/docs/assets/images/DeepSpeed_light_transparent.svg old mode 100644 new mode 100755 diff --git a/docs/assets/images/accelerate-dark.png b/docs/assets/images/accelerate-dark.png new file mode 100755 index 0000000000000000000000000000000000000000..37f870cc3f828f78d07b6dc819deac8c19027bf8 Binary files /dev/null and b/docs/assets/images/accelerate-dark.png differ diff --git a/docs/assets/images/accelerate-light.png b/docs/assets/images/accelerate-light.png new file mode 100755 index 0000000000000000000000000000000000000000..d60173cf582a4f3b01bb0575d1dced41f9414a53 Binary files /dev/null and b/docs/assets/images/accelerate-light.png differ diff --git a/docs/assets/images/accelerate.png 
b/docs/assets/images/accelerate.png new file mode 100755 index 0000000000000000000000000000000000000000..9e9111ac178c8a4f117c5e84063a74a01c23becd Binary files /dev/null and b/docs/assets/images/accelerate.png differ diff --git a/docs/assets/images/adam-convergence.png b/docs/assets/images/adam-convergence.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/bert-ib.png b/docs/assets/images/bert-ib.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/bert-scaling.png b/docs/assets/images/bert-scaling.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/bert-tcp.png b/docs/assets/images/bert-tcp.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/bingbert-mixedbit.png b/docs/assets/images/bingbert-mixedbit.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/convergence-table.png b/docs/assets/images/convergence-table.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/data_efficiency/data_efficiecy_fig0.png b/docs/assets/images/data_efficiency/data_efficiecy_fig0.png new file mode 100644 index 0000000000000000000000000000000000000000..1a32eaef49ee8378352b61733f4fb1d0f85e6b61 Binary files /dev/null and b/docs/assets/images/data_efficiency/data_efficiecy_fig0.png differ diff --git a/docs/assets/images/data_efficiency/data_efficiecy_fig1.png b/docs/assets/images/data_efficiency/data_efficiecy_fig1.png new file mode 100644 index 0000000000000000000000000000000000000000..95220633e50d8f6dcdc58dccfd11cf96bb8ce272 Binary files /dev/null and b/docs/assets/images/data_efficiency/data_efficiecy_fig1.png differ diff --git a/docs/assets/images/data_efficiency/data_efficiecy_fig2.png b/docs/assets/images/data_efficiency/data_efficiecy_fig2.png new file mode 100644 index 0000000000000000000000000000000000000000..07c088f0ce070106a2ffad00e9e20ce145f0076b Binary files /dev/null and b/docs/assets/images/data_efficiency/data_efficiecy_fig2.png differ diff --git 
a/docs/assets/images/data_efficiency/data_efficiecy_fig3.png b/docs/assets/images/data_efficiency/data_efficiecy_fig3.png new file mode 100644 index 0000000000000000000000000000000000000000..52a44e7a23491a0846c54fcde2613d52b80451f2 Binary files /dev/null and b/docs/assets/images/data_efficiency/data_efficiecy_fig3.png differ diff --git a/docs/assets/images/determined.svg b/docs/assets/images/determined.svg new file mode 100644 index 0000000000000000000000000000000000000000..8163db12ad44542b5be247a8f6857d4e133c9dc5 --- /dev/null +++ b/docs/assets/images/determined.svg @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/assets/images/gpu-numbers.png b/docs/assets/images/gpu-numbers.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/hf-logo.png b/docs/assets/images/hf-logo.png new file mode 100755 index 0000000000000000000000000000000000000000..7708a9f4d9413a7a80f48feb59e15ce6e673a24e Binary files /dev/null and b/docs/assets/images/hf-logo.png differ diff --git a/docs/assets/images/hf-transformers.png b/docs/assets/images/hf-transformers.png new file mode 100755 index 0000000000000000000000000000000000000000..70d7c48942cb60b2dc8ae2e3b06d92efed2f6538 Binary files /dev/null and b/docs/assets/images/hf-transformers.png differ diff --git a/docs/assets/images/inference-gemm-scheduling.png b/docs/assets/images/inference-gemm-scheduling.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/inference-kernel-fusion.png b/docs/assets/images/inference-kernel-fusion.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/inference-latency.png b/docs/assets/images/inference-latency.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/inference-throughput.png b/docs/assets/images/inference-throughput.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/large-model-graph.png b/docs/assets/images/large-model-graph.png new file mode 100755 index 
0000000000000000000000000000000000000000..1e82c2d2d455c5a2ecf5569fe2f7d96b9c9b77d6 Binary files /dev/null and b/docs/assets/images/large-model-graph.png differ diff --git a/docs/assets/images/lightning-dark.png b/docs/assets/images/lightning-dark.png new file mode 100755 index 0000000000000000000000000000000000000000..d1c929b971a5a17bf5afa300d49858d280d7cac7 Binary files /dev/null and b/docs/assets/images/lightning-dark.png differ diff --git a/docs/assets/images/lightning-dark.svg b/docs/assets/images/lightning-dark.svg new file mode 100755 index 0000000000000000000000000000000000000000..23f34ecbd4c4182f95dd9eca847f20635636415f --- /dev/null +++ b/docs/assets/images/lightning-dark.svg @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/docs/assets/images/lightning-light.svg b/docs/assets/images/lightning-light.svg new file mode 100755 index 0000000000000000000000000000000000000000..9c89331b7917ed2e2d523cde1ce256b3298e0db3 --- /dev/null +++ b/docs/assets/images/lightning-light.svg @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/docs/assets/images/lightning.png b/docs/assets/images/lightning.png new file mode 100755 index 0000000000000000000000000000000000000000..2d789ef09bc2c80a3dad16e2764cb4635fd0c5ac Binary files /dev/null and b/docs/assets/images/lightning.png differ diff --git a/docs/assets/images/mii/azure-cost.png b/docs/assets/images/mii/azure-cost.png new file mode 100755 index 0000000000000000000000000000000000000000..942e1c4d902bf935e503e317a197741bb9468873 Binary files /dev/null and b/docs/assets/images/mii/azure-cost.png differ diff --git a/docs/assets/images/mii/bert.png b/docs/assets/images/mii/bert.png new file mode 100644 index 0000000000000000000000000000000000000000..a379b3169ffbb8f8ed0f527c9ea235791adf2f5d Binary files /dev/null and b/docs/assets/images/mii/bert.png differ diff --git a/docs/assets/images/mii/bloom.png b/docs/assets/images/mii/bloom.png new file mode 100644 index 
0000000000000000000000000000000000000000..e78664c69cde4c92f1f696e9951eb3c1c8431992 Binary files /dev/null and b/docs/assets/images/mii/bloom.png differ diff --git a/docs/assets/images/mii/gpt.png b/docs/assets/images/mii/gpt.png new file mode 100644 index 0000000000000000000000000000000000000000..740b94f8ff5554c81fa44ca30276b12a411398e8 Binary files /dev/null and b/docs/assets/images/mii/gpt.png differ diff --git a/docs/assets/images/mii/hero-transparent.png b/docs/assets/images/mii/hero-transparent.png new file mode 100755 index 0000000000000000000000000000000000000000..fd758e074883ba33ae8fffc90d3188bd246a157f Binary files /dev/null and b/docs/assets/images/mii/hero-transparent.png differ diff --git a/docs/assets/images/mii/hero.png b/docs/assets/images/mii/hero.png new file mode 100755 index 0000000000000000000000000000000000000000..b7a6e0445162af2f42bcabdda09ce2a364f018f8 Binary files /dev/null and b/docs/assets/images/mii/hero.png differ diff --git a/docs/assets/images/mii/llm-latency-sd-latency-zoom.png b/docs/assets/images/mii/llm-latency-sd-latency-zoom.png new file mode 100755 index 0000000000000000000000000000000000000000..48182d717f828fb29a47f4aeade5605e7f0efc0c Binary files /dev/null and b/docs/assets/images/mii/llm-latency-sd-latency-zoom.png differ diff --git a/docs/assets/images/mii/llm-latency-sd-latency.png b/docs/assets/images/mii/llm-latency-sd-latency.png new file mode 100755 index 0000000000000000000000000000000000000000..0632f92db51b3897f01993c2514320a63fe2b3d4 Binary files /dev/null and b/docs/assets/images/mii/llm-latency-sd-latency.png differ diff --git a/docs/assets/images/mii/mii-arch.png b/docs/assets/images/mii/mii-arch.png new file mode 100755 index 0000000000000000000000000000000000000000..928357d5b5269a4e30b0738eea8331ac1a4701b9 Binary files /dev/null and b/docs/assets/images/mii/mii-arch.png differ diff --git a/docs/assets/images/mii/multi-gpu-latency.png b/docs/assets/images/mii/multi-gpu-latency.png new file mode 100755 index 
0000000000000000000000000000000000000000..df62f1b1d9dc65b95d13fb79d42912a40dd7c96c Binary files /dev/null and b/docs/assets/images/mii/multi-gpu-latency.png differ diff --git a/docs/assets/images/mii/opt-bloom.png b/docs/assets/images/mii/opt-bloom.png new file mode 100755 index 0000000000000000000000000000000000000000..daab4ab59acf38f4a95223f161ed2256b32f3b33 Binary files /dev/null and b/docs/assets/images/mii/opt-bloom.png differ diff --git a/docs/assets/images/mii/opt.png b/docs/assets/images/mii/opt.png new file mode 100644 index 0000000000000000000000000000000000000000..8e050b0b061e9aa605aecfbf283e2a86ea46acce Binary files /dev/null and b/docs/assets/images/mii/opt.png differ diff --git a/docs/assets/images/mii/roberta.png b/docs/assets/images/mii/roberta.png new file mode 100644 index 0000000000000000000000000000000000000000..e4dca962c362e601dbadd4c0dc21d1fb29504720 Binary files /dev/null and b/docs/assets/images/mii/roberta.png differ diff --git a/docs/assets/images/mii/sd-latency.png b/docs/assets/images/mii/sd-latency.png new file mode 100755 index 0000000000000000000000000000000000000000..53003a9db910e0ba981333fa705fba1a841b114e Binary files /dev/null and b/docs/assets/images/mii/sd-latency.png differ diff --git a/docs/assets/images/mii/tput-llms.png b/docs/assets/images/mii/tput-llms.png new file mode 100755 index 0000000000000000000000000000000000000000..5fd1dd73f75ecda222e5d0ebf7b9c4bef3f531b8 Binary files /dev/null and b/docs/assets/images/mii/tput-llms.png differ diff --git a/docs/assets/images/moe-nlg.png b/docs/assets/images/moe-nlg.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/mosaicml.svg b/docs/assets/images/mosaicml.svg new file mode 100755 index 0000000000000000000000000000000000000000..8f6aadb9556d401b8ed29c465bfb5222131a5a60 --- /dev/null +++ b/docs/assets/images/mosaicml.svg @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/assets/images/old-vs-new-azure.png 
b/docs/assets/images/old-vs-new-azure.png new file mode 100755 index 0000000000000000000000000000000000000000..2fc710c042e9d333b4fe1f907e63eada00a0386b Binary files /dev/null and b/docs/assets/images/old-vs-new-azure.png differ diff --git a/docs/assets/images/onebit-adam-overview.png b/docs/assets/images/onebit-adam-overview.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/onebit-convergence.png b/docs/assets/images/onebit-convergence.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/perf-overview.png b/docs/assets/images/perf-overview.png new file mode 100755 index 0000000000000000000000000000000000000000..7c4e08fbc1876114c97fd0c30f97c5d1abe7efb5 Binary files /dev/null and b/docs/assets/images/perf-overview.png differ diff --git a/docs/assets/images/pipe-schedule.png b/docs/assets/images/pipe-schedule.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/quantization-8bit.png b/docs/assets/images/quantization-8bit.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/quantization-mixedbit.png b/docs/assets/images/quantization-mixedbit.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/squad-ib.png b/docs/assets/images/squad-ib.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/squad-scaling.png b/docs/assets/images/squad-scaling.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/squad-tcp.png b/docs/assets/images/squad-tcp.png old mode 100644 new mode 100755 diff --git a/docs/assets/images/tensorboard_monitor.PNG b/docs/assets/images/tensorboard_monitor.PNG new file mode 100644 index 0000000000000000000000000000000000000000..b62d96c335b1bf13328878496df9c4e863aeeab0 Binary files /dev/null and b/docs/assets/images/tensorboard_monitor.PNG differ diff --git a/docs/assets/images/transformers-dark.png b/docs/assets/images/transformers-dark.png new file mode 100755 index 0000000000000000000000000000000000000000..f48984e9c73581730e0b3886036d8a23bbdb8b89 
Binary files /dev/null and b/docs/assets/images/transformers-dark.png differ diff --git a/docs/assets/images/transformers-light.png b/docs/assets/images/transformers-light.png new file mode 100755 index 0000000000000000000000000000000000000000..f4b5cee4d98b26b2efaca9e55b8e9a38196514ba Binary files /dev/null and b/docs/assets/images/transformers-light.png differ diff --git a/docs/assets/images/vmss-setup.png b/docs/assets/images/vmss-setup.png new file mode 100755 index 0000000000000000000000000000000000000000..cb4f317cbb78841236f7e7dd9b784a85bd3316c8 Binary files /dev/null and b/docs/assets/images/vmss-setup.png differ diff --git a/docs/assets/images/wandb_monitor.PNG b/docs/assets/images/wandb_monitor.PNG new file mode 100644 index 0000000000000000000000000000000000000000..f65aa6c5cda8e6250256018280c2c6d206e16ca0 Binary files /dev/null and b/docs/assets/images/wandb_monitor.PNG differ diff --git a/docs/assets/images/xtc-1.png b/docs/assets/images/xtc-1.png new file mode 100644 index 0000000000000000000000000000000000000000..31fde8f3e623f57a35f04399b17c5c618a5c7dcf Binary files /dev/null and b/docs/assets/images/xtc-1.png differ diff --git a/docs/assets/images/xtc-2.png b/docs/assets/images/xtc-2.png new file mode 100644 index 0000000000000000000000000000000000000000..27f57ca338576717dd9e54613046d28c5abc89dd Binary files /dev/null and b/docs/assets/images/xtc-2.png differ diff --git a/docs/assets/images/xtc-3.png b/docs/assets/images/xtc-3.png new file mode 100644 index 0000000000000000000000000000000000000000..2bb9d881358462322c73e39a4f7f0638d4f02c7a Binary files /dev/null and b/docs/assets/images/xtc-3.png differ diff --git a/docs/assets/images/xtc-4.png b/docs/assets/images/xtc-4.png new file mode 100644 index 0000000000000000000000000000000000000000..d4946e811a4a1c843e8c39b1a76ef4879c4cde47 Binary files /dev/null and b/docs/assets/images/xtc-4.png differ diff --git a/docs/assets/images/zero_inference_full_offload.png 
b/docs/assets/images/zero_inference_full_offload.png new file mode 100644 index 0000000000000000000000000000000000000000..5e303c86219b7aaf9146bb0cce74698c43cae845 Binary files /dev/null and b/docs/assets/images/zero_inference_full_offload.png differ diff --git a/docs/assets/images/zero_inference_model_scale.png b/docs/assets/images/zero_inference_model_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..2d1c367fd22e65b633ead983499bd6a3da961369 Binary files /dev/null and b/docs/assets/images/zero_inference_model_scale.png differ diff --git a/docs/assets/images/zero_inference_models.png b/docs/assets/images/zero_inference_models.png new file mode 100644 index 0000000000000000000000000000000000000000..7f96c7b099d40611562a40633ffc9bd5fe36ea31 Binary files /dev/null and b/docs/assets/images/zero_inference_models.png differ diff --git a/docs/assets/images/zero_inference_multi_gpu.png b/docs/assets/images/zero_inference_multi_gpu.png new file mode 100644 index 0000000000000000000000000000000000000000..0ed2f43a61348d8bdf19552543bdefed54c06a47 Binary files /dev/null and b/docs/assets/images/zero_inference_multi_gpu.png differ diff --git a/docs/assets/images/zero_inference_prefetch.png b/docs/assets/images/zero_inference_prefetch.png new file mode 100644 index 0000000000000000000000000000000000000000..d46d616ba83a0652d014d47bef0f116176a0ccf2 Binary files /dev/null and b/docs/assets/images/zero_inference_prefetch.png differ diff --git a/docs/assets/images/zero_inference_token_count_batch_size.png b/docs/assets/images/zero_inference_token_count_batch_size.png new file mode 100644 index 0000000000000000000000000000000000000000..a54b7d28ff6c653ac5b9622038bac934d04636cf Binary files /dev/null and b/docs/assets/images/zero_inference_token_count_batch_size.png differ diff --git a/docs/assets/images/zero_inference_token_count_cpu_throughput.png b/docs/assets/images/zero_inference_token_count_cpu_throughput.png new file mode 100644 index 
0000000000000000000000000000000000000000..5f97ad4e9f8bb61c614d7b482016b5c423067370 Binary files /dev/null and b/docs/assets/images/zero_inference_token_count_cpu_throughput.png differ diff --git a/docs/assets/images/zero_inference_token_count_nvme_throughput.png b/docs/assets/images/zero_inference_token_count_nvme_throughput.png new file mode 100644 index 0000000000000000000000000000000000000000..9c617c6eab172fe96c6a68dd0a14422cbe069aa9 Binary files /dev/null and b/docs/assets/images/zero_inference_token_count_nvme_throughput.png differ diff --git a/docs/code-docs/build-api-docs.sh b/docs/code-docs/build-api-docs.sh old mode 100644 new mode 100755 diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index cb00d0d6be457525234563dc0ede6364ad1adae0..059aa7c0b67c60a6fd2f579563eabd9a7399f24f 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. 
For a full @@ -20,7 +22,8 @@ copyright = '2020, Microsoft' author = 'Microsoft' # The full version, including alpha/beta/rc tags -release = '0.3.0' +with open("../../../version.txt", "r") as f: + release = f.readline().rstrip() master_doc = 'index' @@ -37,10 +40,25 @@ extensions = [ 'sphinx.ext.viewcode', 'recommonmark', 'sphinx_rtd_theme', + 'sphinxcontrib.autodoc_pydantic', + 'sphinx.ext.autosectionlabel', ] pygments_style = 'sphinx' +# autodoc_pydantic config +autodoc_pydantic_model_show_field_summary = False +autodoc_pydantic_field_signature_prefix = ' ' +autodoc_pydantic_model_signature_prefix = 'class' +autodoc_pydantic_model_show_json = False +autodoc_pydantic_model_show_config_summary = False +autodoc_pydantic_model_show_config_member = False +autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_summary_list_order = 'bysource' +autodoc_pydantic_model_member_order = 'bysource' +autodoc_pydantic_field_list_validators = False + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -70,14 +88,11 @@ html_context = { "conf_py_path": "/docs/code-docs/source/", } -# Mock imports so we don't have to install torch to build the docs. -from unittest.mock import MagicMock - sys.path.insert(0, os.path.abspath('../../../')) # Prepend module names to class descriptions? add_module_names = True -autoclass_content = 'both' +autoclass_content = 'auto' autodoc_mock_imports = ["apex", "mpi4py", "tensorboardX", "numpy", "cupy"] diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index 293d93830e6b18b992352ca38394ccabfac1f7ac..67d5aa5fe9fb3d695323aaa7362f4de1247a3993 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -19,7 +19,7 @@ Training API training Inference API ------------- +------------- ..
toctree:: :maxdepth: 2 @@ -64,7 +64,7 @@ Pipeline Parallelism pipeline Optimizers --------------------- +---------- .. toctree:: :maxdepth: 2 @@ -98,6 +98,13 @@ Memory Usage memory +Monitoring +---------- +.. toctree:: + :maxdepth: 2 + + monitor + Indices and tables ------------------ diff --git a/docs/code-docs/source/inference-init.rst b/docs/code-docs/source/inference-init.rst index b4a155cf5588b76898b702716316140ca0b7c3f7..5ac01c1932d48b2d0e35aba1a82ace8d6b11663b 100644 --- a/docs/code-docs/source/inference-init.rst +++ b/docs/code-docs/source/inference-init.rst @@ -6,6 +6,37 @@ Example usage: .. code-block:: python - engine = deepspeed.init_inference(model=net) + engine = deepspeed.init_inference(model=net, config=config) + +The ``DeepSpeedInferenceConfig`` is used to control all aspects of initializing +the ``InferenceEngine``. The config should be passed as a dictionary to +``init_inference``, but parameters can also be passed as keyword arguments. + +.. _DeepSpeedInferenceConfig: +.. autopydantic_model:: deepspeed.inference.config.DeepSpeedInferenceConfig + +.. _DeepSpeedTPConfig: +.. autopydantic_model:: deepspeed.inference.config.DeepSpeedTPConfig + +.. _DeepSpeedMoEConfig: +.. autopydantic_model:: deepspeed.inference.config.DeepSpeedMoEConfig + +.. _QuantizationConfig: +.. autopydantic_model:: deepspeed.inference.config.QuantizationConfig + +.. _InferenceCheckpointConfig: +.. autopydantic_model:: deepspeed.inference.config.InferenceCheckpointConfig + + +Example config: + +.. code-block:: python + + config = { + "kernel_inject": True, + "tensor_parallel": {"tp_size": 4}, + "dtype": "fp16", + "enable_cuda_graph": False + } .. 
autofunction:: deepspeed.init_inference diff --git a/docs/code-docs/source/memory.rst b/docs/code-docs/source/memory.rst index 5c92dc199aa4de6002adb6f45a78c01881673ea7..28e96955d41f974561e86c4a56b65a5f9996b0ad 100644 --- a/docs/code-docs/source/memory.rst +++ b/docs/code-docs/source/memory.rst @@ -7,9 +7,9 @@ API To Estimate Memory Usage ZeRO2: -.. autofunction:: deepspeed.runtime.zero.stage2.estimate_zero2_model_states_mem_needs_all_live +.. autofunction:: deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_live -.. autofunction:: deepspeed.runtime.zero.stage2.estimate_zero2_model_states_mem_needs_all_cold +.. autofunction:: deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_cold Examples: @@ -18,15 +18,16 @@ Let's try a 3B model with just 1 node with 8 gpus, using live model: .. code-block:: bash python -c 'from transformers import AutoModel; \ - from deepspeed.runtime.zero.stage2 import estimate_zero2_model_states_mem_needs_all_live; \ + from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live; \ model = AutoModel.from_pretrained("t5-3b"); \ estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=8, num_nodes=1)' + Estimated memory needed for params, optim states and gradients for a: HW: Setup with 1 node, 8 GPUs per node. SW: Model with 2851M total params. per CPU | per GPU | Options - 127.48GB | 5.31GB | cpu_offload=1 - 127.48GB | 15.93GB | cpu_offload=0 + 127.48GB | 5.31GB | offload_optimizer=cpu + 127.48GB | 15.93GB | offload_optimizer=none Now, without the actual model, which requires us to know ``total_params`` and ``largest_layer_params``, but we got those from the run above, so future estimators are now much @@ -34,14 +35,15 @@ faster as we don't need to load the model. .. 
code-block:: bash - python -c 'from deepspeed.runtime.zero.stage2 import estimate_zero2_model_states_mem_needs_all_cold; \ + python -c 'from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_cold; \ estimate_zero2_model_states_mem_needs_all_cold(total_params=2851e6, num_gpus_per_node=8, num_nodes=1)' + Estimated memory needed for params, optim states and gradients for a: HW: Setup with 1 node, 8 GPUs per node. SW: Model with 2851M total params. per CPU | per GPU | Options - 127.45GB | 5.31GB | cpu_offload=1 - 127.45GB | 15.93GB | cpu_offload=0 + 127.45GB | 5.31GB | offload_optimizer=cpu + 127.45GB | 15.93GB | offload_optimizer=none There is a slight difference due to rounding - the actual live model has a few more params @@ -67,12 +69,12 @@ Let's try a 3B model with just 1 node with 8 gpus, using live model: HW: Setup with 1 node, 8 GPUs per node. SW: Model with 2851M total params, 32M largest layer params. per CPU | per GPU | Options - 71.71GB | 0.12GB | cpu_offload=1, cpu_offload_params=1, zero_init=1 - 127.48GB | 0.12GB | cpu_offload=1, cpu_offload_params=1, zero_init=0 - 63.74GB | 0.79GB | cpu_offload=1, cpu_offload_params=0, zero_init=1 - 127.48GB | 0.79GB | cpu_offload=1, cpu_offload_params=0, zero_init=0 - 1.47GB | 6.10GB | cpu_offload=0, cpu_offload_params=0, zero_init=1 - 127.48GB | 6.10GB | cpu_offload=0, cpu_offload_params=0, zero_init=0 + 71.71GB | 0.12GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1 + 127.48GB | 0.12GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0 + 63.74GB | 0.79GB | offload_param=none, offload_optimizer=cpu , zero_init=1 + 127.48GB | 0.79GB | offload_param=none, offload_optimizer=cpu , zero_init=0 + 1.47GB | 6.10GB | offload_param=none, offload_optimizer=none, zero_init=1 + 127.48GB | 6.10GB | offload_param=none, offload_optimizer=none, zero_init=0 Now, without the actual model, which requires us to know ``total_params`` and ``largest_layer_params``, but we got those from 
the run above, so future estimators are now much @@ -87,12 +89,12 @@ faster as we don't need to load the model. HW: Setup with 1 node, 8 GPUs per node. SW: Model with 2851M total params, 32M largest layer params. per CPU | per GPU | Options - 71.69GB | 0.12GB | cpu_offload=1, cpu_offload_params=1, zero_init=1 - 127.45GB | 0.12GB | cpu_offload=1, cpu_offload_params=1, zero_init=0 - 63.72GB | 0.78GB | cpu_offload=1, cpu_offload_params=0, zero_init=1 - 127.45GB | 0.78GB | cpu_offload=1, cpu_offload_params=0, zero_init=0 - 1.43GB | 6.09GB | cpu_offload=0, cpu_offload_params=0, zero_init=1 - 127.45GB | 6.09GB | cpu_offload=0, cpu_offload_params=0, zero_init=0 + 71.69GB | 0.12GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1 + 127.45GB | 0.12GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0 + 63.72GB | 0.78GB | offload_param=none, offload_optimizer=cpu , zero_init=1 + 127.45GB | 0.78GB | offload_param=none, offload_optimizer=cpu , zero_init=0 + 1.43GB | 6.09GB | offload_param=none, offload_optimizer=none, zero_init=1 + 127.45GB | 6.09GB | offload_param=none, offload_optimizer=none, zero_init=0 There is a slight difference due to rounding - the actual live model has a few more params diff --git a/docs/code-docs/source/monitor.rst b/docs/code-docs/source/monitor.rst new file mode 100644 index 0000000000000000000000000000000000000000..a0ab894a97e4ea29f4358f5845f10cbf5450e3d0 --- /dev/null +++ b/docs/code-docs/source/monitor.rst @@ -0,0 +1,35 @@ +Monitoring +========== + +Deepspeed’s Monitor module can log training details into a +Tensorboard-compatible file, to WandB, or to simple CSV files. Below is an +overview of what DeepSpeed will log automatically. + +.. 
csv-table:: Automatically Logged Data + :header: "Field", "Description", "Condition" + :widths: 20, 20, 10 + + `Train/Samples/train_loss`,The training loss.,None + `Train/Samples/lr`,The learning rate during training.,None + `Train/Samples/loss_scale`,The loss scale when training using `fp16`.,`fp16` must be enabled. + `Train/Eigenvalues/ModelBlockParam_{i}`,Eigen values per param block.,`eigenvalue` must be enabled. + `Train/Samples/elapsed_time_ms_forward`,The global duration of the forward pass.,`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_backward`,The global duration of the backward pass.,`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_backward_inner`,The backward time that does not include the gradient reduction time. Only in cases where the gradient reduction is not overlapped, if it is overlapped then the inner time should be about the same as the entire backward time.,`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_backward_allreduce`,The global duration of the allreduce operation.,`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_step`,The optimizer step time,`flops_profiler.enabled` or `wall_clock_breakdown`. + +TensorBoard +----------- +.. _TensorBoardConfig: +.. autopydantic_model:: deepspeed.monitor.config.TensorBoardConfig + +WandB +----- +.. _WandbConfig: +.. autopydantic_model:: deepspeed.monitor.config.WandbConfig + +CSV Monitor +----------- +.. _CSVConfig: +..
autopydantic_model:: deepspeed.monitor.config.CSVConfig diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst old mode 100644 new mode 100755 diff --git a/docs/code-docs/source/schedulers.rst b/docs/code-docs/source/schedulers.rst old mode 100644 new mode 100755 diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index daced77d909396a9ee60e2764dc324b358364856..60c3470185896371e85041d23399bca731f8b143 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -51,6 +51,24 @@ for a complete list of options for configuration and performance tuning. our `optimizer config `_ to instruct :meth:`deepspeed.initialize` to build the optimizer for you. +ZeRO Configurations +=================== + +All the settings for DeepSpeed ZeRO are set with the `DeepSpeedZeroConfig`_. +The dictionary provided under the ``zero_optimization`` entry of the main +DeepSpeed configuration dict will be parsed and validated with this class. +Sub-configurations for parameter offload and optimizer offload settings are +parsed by `DeepSpeedZeroOffloadParamConfig`_ and +`DeepSpeedZeroOffloadOptimizerConfig`_. + +.. _DeepSpeedZeroConfig: +.. autopydantic_model:: deepspeed.runtime.zero.config.DeepSpeedZeroConfig + +.. _DeepSpeedZeroOffloadParamConfig: +.. autopydantic_model:: deepspeed.runtime.zero.config.DeepSpeedZeroOffloadParamConfig + +.. _DeepSpeedZeroOffloadOptimizerConfig: +.. autopydantic_model:: deepspeed.runtime.zero.config.DeepSpeedZeroOffloadOptimizerConfig Example ZeRO-3 Configurations @@ -275,3 +293,41 @@ parallelism to fit them in limited GPU memory. .. autoclass:: deepspeed.zero.TiledLinear :members: + + +Debugging +--------- + +Debugging ZeRO training is complicated by the partitioning of parameters, gradients, and optimizer states. None of these 3 groups of tensors (model states) can be normally accessed because of that.
To overcome that DeepSpeed provides the following routines for accessing individual model states in their unpartitioned form. + +Important: Please note that these utilities must be called by all processes participating in the training, even if you decide to do something with the result only in the main process. If all processes don't participate these utilities will hang waiting for all processes to send their contribution. + +Additionally, you must be aware that these routines return correct data only in specific phases of the training. So for examples the gradients are valid after ``backward`` and before ``step``. The optimizer states are updated after ``step``. Same goes for fp32 master weights. + +.. autofunction:: deepspeed.utils.safe_get_full_fp32_param + +.. autofunction:: deepspeed.utils.safe_get_full_grad + +.. autofunction:: deepspeed.utils.safe_get_full_optimizer_state + + +These routines can be used in a training loop as shown in the following snippet. + +.. code-block:: python + + backward(loss) + [...] + from deepspeed.utils import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state + for n, lp in model.named_parameters(): + # 1. gradient lookup + # For zero1 and zero2, gradient lookup must be called after `backward` and before `step` + # For zero3, gradient lookup must be called after `backward` + hp_grad = safe_get_full_grad(lp) + + # 2. fp32 and optim states can probably be called anywhere in the training loop, but will be updated after `step` + hp = safe_get_full_fp32_param(lp) + exp_avg = safe_get_full_optimizer_state(lp, "exp_avg") + exp_avg_sq = safe_get_full_optimizer_state(lp, "exp_avg_sq") + + [...] 
+ optimizer.step() diff --git a/docs/index.md b/docs/index.md old mode 100644 new mode 100755 index 38830ce3141748282c3dad989ccc193827d512fb..79c5ad21f058c9249c4345bc9330f6f58eb930b5 --- a/docs/index.md +++ b/docs/index.md @@ -5,211 +5,83 @@ toc_label: "Contents" title: "Latest News" --- + DeepSpeed trained the world's most powerful language models ([MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)); [learn how](https://www.deepspeed.ai/tutorials/large-models-w-deepspeed/). -* [2022/03/21] [Supporting efficient large model training on AMD Instinct GPUs with DeepSpeed](https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/) -* [2022/03/07] [Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam](https://www.deepspeed.ai/tutorials/zero-one-adam/) -* [2022/01/19] [DeepSpeed: Advancing MoE inference and training to power next-generation AI scale](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) - * [Mixture of Experts (MoE) for NLG tutorial](https://www.deepspeed.ai/tutorials/mixture-of-experts-nlg/). - * [Mixture of Experts (MoE) Inference tutorial](https://www.deepspeed.ai/tutorials/moe-inference-tutorial). 
-* [2021/11/15] [Autotuning: Automatically discover the optimal DeepSpeed configuration that delivers good training speed](https://www.deepspeed.ai/2021/11/16/autotuning.html) -* [2021/10/11] [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, the World’s Largest and Most Powerful Generative Language Model](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) - * Read more on how to [train large models with DeepSpeed](https://www.deepspeed.ai/tutorials/large-models-w-deepspeed/) - - - DeepSpeed+Megatron trained the world's most powerful language model: [MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) - - DeepSpeed is hiring, [come join us!](https://careers.microsoft.com/us/en/search-results?keywords=http:%2F%2Fdeepspeed.ai) - -DeepSpeed is a deep learning optimization library that makes distributed training easy, -efficient, and effective. - -

10x Larger Models

-

10x Faster Training

-

Minimal Code Change

- -DeepSpeed delivers extreme-scale model training for everyone, from data scientists training on massive supercomputers to those training on low-end clusters or even on a single GPU: -* Extreme scale: Using current generation of GPU clusters with hundreds of devices, 3D parallelism of DeepSpeed can efficiently train deep learning models with trillions of parameters. -* Extremely memory efficient: With just a single GPU, ZeRO-Offload of DeepSpeed can train models with over 10B parameters, 10x bigger than the state of the art, democratizing multi-billion-parameter model training such that many deep learning scientists can explore bigger and better models. -* Extremely long sequence length: Sparse attention of DeepSpeed powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution comparing with dense transformers. -* Extremely communication efficient: 3D parallelism improves communication efficiency allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam, 0/1 Adam and 1-bit LAMB reduce communication volume by up to 26x while achieving similar convergence efficiency to Adam/LAMB, allowing for scaling to different types of GPU clusters and networks. - -Early adopters of DeepSpeed have already produced -a language model (LM) with over 17B parameters called -[Turing-NLG](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft), -establishing a new SOTA in the LM category. - -DeepSpeed is an important part of Microsoft’s new -[AI at Scale](https://www.microsoft.com/en-us/research/project/ai-at-scale/) -initiative to enable next-generation AI capabilities at scale, where you can find more -information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). - -# Why DeepSpeed? -Training advanced deep learning models is challenging. 
Beyond model design, -model scientists also need to set up the state-of-the-art training techniques -such as distributed training, mixed precision, gradient accumulation, and -checkpointing. Yet still, scientists may not achieve the desired system -performance and convergence rate. Large model sizes are even more challenging: -a large model easily runs out of memory with pure data parallelism and it is -difficult to use model parallelism. DeepSpeed addresses these challenges to -accelerate model development *and* training. - -## Distributed, Effective, and Efficient Training with Ease -The DeepSpeed API is a lightweight wrapper on [PyTorch](https://pytorch.org/). This -means that you can use everything you love in PyTorch and without learning a new -platform. In addition, DeepSpeed manages all of the boilerplate state-of-the-art -training techniques, such as distributed training, mixed precision, gradient -accumulation, and checkpoints so that you can focus on your model development. Most -importantly, you can leverage the distinctive efficiency and effectiveness benefit of -DeepSpeed to boost speed and scale with just a few lines of code changes to your PyTorch -models. +* [2023/02] [Automatic Tensor Parallelism: Enables tensor parallelism by default without providing an injection policy](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) +* [2022/12] [DeepSpeed Data Efficiency: A composable library that makes better use of data, increases training efficiency, and improves model quality](https://www.deepspeed.ai/2022/12/11/data-efficiency.html) +* [2022/11] [Stable Diffusion Image Generation under 1 second w. 
DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/benchmark/txt2img) +* [2022/10] [DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference](https://www.deepspeed.ai/2022/10/10/mii.html) +* [2022/09] [ZeRO-Inference: Democratizing massive model inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html) +* [2022/07] [Azure and DeepSpeed empower easy-to-use and high-performance model training](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/) + + +# Extreme Speed and Scale for DL Training and Inference + + DeepSpeed is an easy-to-use deep learning optimization software suite that enables unprecedented scale and speed for Deep Learning Training and Inference. With DeepSpeed you can: + +* Train/Inference dense or sparse models with billions or trillions of parameters +* Achieve excellent system throughput and efficiently scale to thousands of GPUs +* Train/Inference on resource constrained GPU systems +* Achieve unprecedented low latency and high throughput for inference +* Achieve extreme compression for an unparalleled inference latency and model size reduction with low costs + + +# DeepSpeed has three innovation pillars: + +![Three innovation pillars](/assets/images/3pillars.png){: .align-center} + + +## DeepSpeed-Training + +DeepSpeed offers a confluence of system innovations, that has made large scale DL training effective, and efficient, greatly improved ease of use, and redefined the DL training landscape in terms of scale that is possible. These innovations such as ZeRO, 3D-Parallelism, DeepSpeed-MoE, ZeRO-Infinity, etc fall under the DeepSpeed-Training pillar.
Learn more: [DeepSpeed-Training](https://www.deepspeed.ai/training) + +## DeepSpeed-Inference + +DeepSpeed brings together innovations in parallelism technology such as tensor, pipeline, expert and ZeRO-parallelism, and combines them with high performance custom inference kernels, communication optimizations and heterogeneous memory technologies to enable inference at an unprecedented scale, while achieving unparalleled latency, throughput and cost reduction. This systematic composition of system technologies for inference falls under the DeepSpeed-Inference. Learn more: [DeepSpeed-Inference](https://www.deepspeed.ai/inference) + +## DeepSpeed-Compression + +To further increase the inference efficiency, DeepSpeed offers easy-to-use and flexible-to-compose compression techniques for researchers and practitioners to compress their models while delivering faster speed, smaller model size, and significantly reduced compression cost. Moreover, SoTA innovations on compression like ZeroQuant and XTC are included under the DeepSpeed-Compression pillar. Learn more: [DeepSpeed-Compression](https://www.deepspeed.ai/compression) + +# DeepSpeed Software Suite + +## DeepSpeed Library + + The [DeepSpeed](https://github.com/microsoft/deepspeed) library implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for easy composition of multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)).
+ +## Model Implementations for Inference (MII) + + [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions. + +## DeepSpeed on Azure + + DeepSpeed users are diverse and have access to different environments. We recommend to try DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). 
+ +# DeepSpeed Adoption + +DeepSpeed has been used to train many different large-scale models, below is a list of several examples that we are aware of (if you'd like to include your model please submit a PR): + + * [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) + * [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf) + * [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed) + * [GLM (130B)](https://github.com/THUDM/GLM-130B) + * [YaLM (100B)](https://github.com/yandex/YaLM-100B) + * [GPT-NeoX (20B)](https://github.com/EleutherAI/gpt-neox) + * [AlexaTM (20B)](https://www.amazon.science/blog/20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning) + * [Turing NLG (17B](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/) + * [METRO-LM (5.4B)](https://arxiv.org/pdf/2204.06644.pdf) + +DeepSpeed has been integrated with several different popular open-source DL frameworks such as: + +| | Documentation | +| ---------------------------------------------------------------------------------------------- | -------------------------------------------- | +| | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) | +| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/main/en/deepspeed) | +| | [Lightning with DeepSpeed](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html) | +| | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) | + +DeepSpeed is an integral part of [Microsoft’s AI at Scale initiative](https://www.microsoft.com/en-us/research/project/ai-at-scale/) to enable 
next-generation AI capabilities at scale. -## Speed -DeepSpeed achieves high performance and fast convergence through a combination of -efficiency optimizations on compute/communication/memory/IO and effectiveness -optimizations on advanced hyperparameter tuning and optimizers. For example: - -* DeepSpeed trains BERT-large to parity in 44 - mins using 1024 V100 GPUs (64 DGX-2 boxes) and in 2.4 hours using 256 GPUs - (16 DGX-2 boxes). - - **BERT-large Training Times** - - | Devices | Source | Training Time | - | -------------- | --------- | ---------------------:| - | 1024 V100 GPUs | DeepSpeed | **44** min| - | 256 V100 GPUs | DeepSpeed | **2.4** hr| - | 64 V100 GPUs | DeepSpeed | **8.68** hr| - | 16 V100 GPUs | DeepSpeed | **33.22** hr| - - *BERT codes and tutorials will be available soon.* - -* DeepSpeed trains GPT2 (1.5 billion parameters) 3.75x faster than state-of-art, NVIDIA - Megatron on Azure GPUs. - - *Read more*: [GPT tutorial](/tutorials/megatron/) - - - -## Memory efficiency -DeepSpeed provides memory-efficient data parallelism and enables training models without -model parallelism. For example, DeepSpeed can train models with up to 13 billion parameters on -a single GPU. In comparison, existing frameworks (e.g., -PyTorch's Distributed Data Parallel) run out of memory with 1.4 billion parameter models. - -DeepSpeed reduces the training memory footprint through a novel solution called Zero -Redundancy Optimizer (ZeRO). Unlike basic data parallelism where memory states are -replicated across data-parallel processes, ZeRO partitions model states and gradients to save -significant memory. Furthermore, it also reduces activation memory and fragmented memory. -The current implementation (ZeRO-2) reduces memory by up to -8x relative to the state-of-art. 
You can read more about ZeRO in our [paper](https://arxiv.org/abs/1910.02054), and -in our blog posts related to -[ZeRO-1](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) and [ZeRO-2](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/). - -With this impressive memory reduction, early adopters of DeepSpeed have already -produced a language model (LM) with over 17B parameters called - -Turing-NLG, -establishing a new SOTA in the LM category. - -For model scientists with limited GPU resources, ZeRO-Offload leverages both CPU and GPU memory for training large models. Using a machine with **a single GPU**, our users can run **models of up to 13 billion parameters** without running out of memory, 10x bigger than the existing approaches, while obtaining competitive throughput. This feature democratizes multi-billion-parameter model training and opens the window for many deep learning practitioners to explore bigger and better models. - -## Scalability -DeepSpeed supports efficient data parallelism, model parallelism, pipeline parallelism and their -combinations, which we call 3D parallelism. -* 3D parallelism of DeepSpeed provides system support to run models with trillions of parameters, read more in our [press-release]({{ site.press_release_v3 }}) and [tutorial](/tutorials/pipeline). -* DeepSpeed can run large models more efficiently, up to 10x - faster for models with - various sizes spanning 1.5B to hundred billion. More specifically, the data parallelism powered by ZeRO - is complementary and can be combined with different types of model parallelism. It allows - DeepSpeed to fit models using lower degree of model parallelism and higher batch size, offering - significant performance gains compared to using model parallelism alone. 
- - *Read more*: [ZeRO paper](https://arxiv.org/abs/1910.02054), - and [GPT tutorial](/tutorials/megatron). - -![DeepSpeed Speedup](/assets/images/deepspeed-speedup.png) -

-The figure depicts system throughput improvements of DeepSpeed (combining ZeRO-powered data parallelism with model parallelism of NVIDIA Megatron-LM) over using Megatron-LM alone. -

- -## Communication efficiency -Pipeline parallelism of DeepSpeed reduce communication volume during distributed training, which allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. -![Low-bandwidth GPT-2 Performance](/assets/images/pp-lowbw-gpt2.png) - -1-bit Adam, 0/1 Adam and 1-bit LAMB reduce communication volume by up to 26x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. [1-bit Adam blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html), [1-bit Adam tutorial](https://www.deepspeed.ai/tutorials/onebit-adam/), [0/1 Adam tutorial](https://www.deepspeed.ai/tutorials/zero-one-adam/), [1-bit LAMB tutorial](https://www.deepspeed.ai/tutorials/onebit-lamb/). - -## Supporting long sequence length -DeepSpeed offers sparse attention kernels—an instrumental technology to support long sequences of model inputs, whether for text, image, or sound. Compared with the classic dense Transformers, it powers **an order-of-magnitude longer input sequence** and obtains up to 6x faster execution with comparable accuracy. It also outperforms state-of-the-art sparse implementations with 1.5–3x faster execution. Furthermore, our sparse kernels support efficient execution of flexible sparse format and empower users to innovate on their custom sparse structures. [Read more here](https://www.deepspeed.ai/2020/09/08/sparse-attention.html). - - -## Fast convergence for effectiveness -DeepSpeed supports advanced hyperparameter tuning and large batch size -optimizers such as [LAMB](https://arxiv.org/abs/1904.00962). These improve the -effectiveness of model training and reduce the number of samples required to -convergence to desired accuracy. - -*Read more*: [Tuning tutorial](/tutorials/one-cycle). - - -## Good Usability -Only a few lines of code changes are needed to enable a PyTorch model to use DeepSpeed and ZeRO. 
Compared to current model parallelism libraries, DeepSpeed does not require a code redesign or model refactoring. It also does not put limitations on model dimensions (such as number of attention heads, hidden sizes, and others), batch size, or any other training parameters. For models of up to 13 billion parameters, you can use ZeRO-powered data parallelism conveniently without requiring model parallelism, while in contrast, standard data parallelism will run out of memory for models with more than 1.4 billion parameters. In addition, DeepSpeed conveniently supports flexible combination of ZeRO-powered data parallelism with custom model parallelisms, such as tensor slicing of NVIDIA's Megatron-LM. - - -## Features - -Below we provide a brief feature list, see our detailed [feature overview](https://www.deepspeed.ai/features/) for descriptions and usage. - -* [Distributed Training with Mixed Precision](https://www.deepspeed.ai/features/#distributed-training-with-mixed-precision) - * 16-bit mixed precision - * Single-GPU/Multi-GPU/Multi-Node -* [Model Parallelism](https://www.deepspeed.ai/features/#model-parallelism) - * Support for Custom Model Parallelism - * Integration with Megatron-LM -* [Pipeline Parallelism](https://www.deepspeed.ai/tutorials/pipeline/) - * 3D Parallelism -* [The Zero Redundancy Optimizer](https://www.deepspeed.ai/tutorials/zero/) - * Optimizer State and Gradient Partitioning - * Activation Partitioning - * Constant Buffer Optimization - * Contiguous Memory Optimization -* [ZeRO-Offload](https://www.deepspeed.ai/tutorials/zero-offload/) - * Leverage both CPU/GPU memory for model training - * Support 10B model training on a single GPU -* [Ultra-fast dense transformer kernels](https://www.deepspeed.ai/2020/05/18/bert-record.html) -* [Sparse attention](https://www.deepspeed.ai/2020/09/08/sparse-attention-news.html) - * Memory- and compute-efficient sparse kernels - * Support 10x long sequences than dense - * Flexible support to different sparse 
structures -* [1-bit Adam](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html), [0/1 Adam](https://www.deepspeed.ai/tutorials/zero-one-adam/) and [1-bit LAMB](https://www.deepspeed.ai/tutorials/onebit-lamb/) - * Custom communication collective - * Up to 26x communication volume saving -* [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations) - * Smart Gradient Accumulation - * Communication/Computation Overlap -* [Training Features](https://www.deepspeed.ai/features/#training-features) - * Simplified training API - * Gradient Clipping - * Automatic loss scaling with mixed precision -* [Training Optimizers](https://www.deepspeed.ai/features/#training-optimizers) - * Fused Adam optimizer and arbitrary `torch.optim.Optimizer` - * Memory bandwidth optimized FP16 Optimizer - * Large Batch Training with LAMB Optimizer - * Memory efficient Training with ZeRO Optimizer - * CPU-Adam -* [Training Agnostic Checkpointing](https://www.deepspeed.ai/features/#training-agnostic-checkpointing) -* [Advanced Parameter Search](https://www.deepspeed.ai/features/#advanced-parameter-search) - * Learning Rate Range Test - * 1Cycle Learning Rate Schedule -* [Simplified Data Loader](https://www.deepspeed.ai/features/#simplified-data-loader) -* [Curriculum Learning](https://www.deepspeed.ai/tutorials/curriculum-learning/) - * A curriculum learning-based data pipeline that presents easier or simpler examples earlier during training - * Stable and 3.3x faster GPT-2 pre-training with 8x/4x larger batch size/learning rate while maintaining token-wise convergence speed - * Complementary to many other DeepSpeed features -* [Progressive Layer Dropping](https://www.deepspeed.ai/2020/10/28/progressive-layer-dropping-news.html) - * Efficient and robust compressed training - * Up to 2.5x convergence speedup for pre-training -* [Performance Analysis and 
Debugging](https://www.deepspeed.ai/features/#performance-analysis-and-debugging) -* [Mixture of Experts (MoE)](https://www.deepspeed.ai/tutorials/mixture-of-experts/) # Contributing DeepSpeed welcomes your contributions! Please see our @@ -238,13 +110,19 @@ comments. 1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727). 2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703). 3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). -4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). +4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840) and [USENIX ATC 2021](https://www.usenix.org/conference/atc21/presentation/ren-jie). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. 
(2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888) and [ICML 2021](http://proceedings.mlr.press/v139/tang21a.html). -6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857). -7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069). -8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) Curriculum Learning: A Regularization Method for Efficient and Stable Billion-Scale GPT Model Pre-Training. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084). +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205). +7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069) and [HiPC 2022](https://hipc.org/advance-program/). +8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084) and [NeurIPS 2022](https://openreview.net/forum?id=JpZ5du_Kdh). 9. Yucheng Lu, Conglong Li, Minjia Zhang, Christopher De Sa, Yuxiong He. (2022) Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam. [arXiv:2202.06009](https://arxiv.org/abs/2202.06009). -10. 
Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596). +10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html). +11. Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, Bryan Catanzaro. (2022) Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model [arXiv:2201.11990](https://arxiv.org/abs/2201.11990). +12. Xiaoxia Wu, Zhewei Yao, Minjia Zhang, Conglong Li, Yuxiong He. (2022) Extreme Compression for Pre-trained Transformers Made Simple and Efficient. [arXiv:2206.01859](https://arxiv.org/abs/2206.01859) and [NeurIPS 2022](https://openreview.net/forum?id=xNeAhc2CNAl). +13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1). +14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. 
[arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). +15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586). +16. Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597). # Videos 1. DeepSpeed KDD 2020 Tutorial @@ -258,3 +136,7 @@ comments. * Registration is free and all videos are available on-demand. * [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html). 3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8) +4. 
Community Tutorials + * [DeepSpeed: All the tricks to scale to gigantic models (Mark Saroufim)](https://www.youtube.com/watch?v=pDGI668pNg0) + * [Turing-NLG, DeepSpeed and the ZeRO optimizer (Yannic Kilcher)](https://www.youtube.com/watch?v=tC01FRB0M7w) + * [Ultimate Guide To Scaling ML Models (The AI Epiphany)](https://www.youtube.com/watch?v=hc0u4avAkuM) diff --git a/env.sh b/env.sh new file mode 100644 index 0000000000000000000000000000000000000000..a632541512d7a009605a680e9265d501812fb244 --- /dev/null +++ b/env.sh @@ -0,0 +1,3 @@ +#!/bin/bash +export LD_LIBRARY_PATH=/public/home/aishsh/env/anaconda3/envs/oneflow_test/lib/:$LD_LIBRARY_PATH +export LIBRARY_PATH=/public/home/aishsh/env/anaconda3/envs/oneflow_test/lib/:$LIBRARY_PATH diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c61ee047df9aed1391d98fbc665aae41c75469f8 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,9 @@ +# DeepSpeed Examples + +If you are looking for examples using DeepSpeed please see the following resources: + +1. [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) +2. [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) +3. [DeepSpeed + AzureML](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed) +4. [DeepSpeed + Hugging Face Transformers Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed) +5. 
[DeepSpeed + PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.utilities.deepspeed.html)
diff --git a/inference/test_checkpoint_sharding.py b/inference/test_checkpoint_sharding.py
new file mode 100644
index 0000000000000000000000000000000000000000..09fb023072f98fa671616fde876d4b2fe7f0fc8a
--- /dev/null
+++ b/inference/test_checkpoint_sharding.py
@@ -0,0 +1,96 @@
+'''Copyright The Microsoft DeepSpeed Team'''
+
+import os
+import pytest
+import torch
+import deepspeed
+from deepspeed.model_implementations import DeepSpeedTransformerInference
+from unit.common import DistributedTest, DistributedFixture
+from transformers import AutoConfig, AutoModelForCausalLM
+
+
+def check_dtype(model, expected_dtype):
+    def find_dtype(module):
+        for child in module.children():
+            if isinstance(child, DeepSpeedTransformerInference):
+                return child.attention.attn_qkvw.dtype
+            else:
+                found_dtype = find_dtype(child)
+                if found_dtype:
+                    return found_dtype
+
+    found_dtype = find_dtype(model)
+    assert found_dtype, "Did not find DeepSpeedTransformerInference in model"
+    assert (
+        found_dtype == expected_dtype
+    ), f"Expected transformer dtype {expected_dtype}, but found {found_dtype}"
+
+
+@pytest.fixture(params=[
+    "bigscience/bloom-560m",
+    "EleutherAI/gpt-j-6B",
+    "EleutherAI/gpt-neo-125M",
+    "facebook/opt-125m"
+])
+def model_name(request):
+    return request.param
+
+
+@pytest.fixture(params=[torch.float16, torch.int8], ids=["fp16", "int8"])
+def dtype(request):
+    return request.param
+
+
+class save_shard(DistributedFixture):
+    world_size = 2
+
+    def run(self, model_name, class_tmpdir):
+        # Only write a checkpoint if one does not exist
+        if not os.path.isdir(os.path.join(class_tmpdir, model_name)):
+            world_size = int(os.getenv("WORLD_SIZE", "1"))
+            inf_config = {
+ 
"replace_with_kernel_inject": True, + "dtype": torch.float16, + "enable_cuda_graph": False, + "tensor_parallel": { + "tp_size": world_size + }, + "save_mp_checkpoint_path": os.path.join(str(class_tmpdir), + model_name), + } + + # Load model and save sharded checkpoint + model = AutoModelForCausalLM.from_pretrained(model_name, + torch_dtype=torch.float16) + model = deepspeed.init_inference(model, config=inf_config) + + +@pytest.mark.seq_inference +class TestCheckpointShard(DistributedTest): + world_size = 2 + + def test(self, model_name, dtype, class_tmpdir, save_shard): + world_size = int(os.getenv("WORLD_SIZE", "1")) + inf_config = { + "replace_with_kernel_inject": True, + "dtype": dtype, + "enable_cuda_graph": False, + "tensor_parallel": { + "tp_size": world_size + }, + "checkpoint": os.path.join(class_tmpdir, + model_name, + "ds_inference_config.json"), + } + + # Load model on meta tensors + model_config = AutoConfig.from_pretrained(model_name) + # Note that we use half precision to load initially, even for int8 + with deepspeed.OnDevice(dtype=torch.float16, device="meta"): + model = AutoModelForCausalLM.from_config(model_config, + torch_dtype=torch.bfloat16) + model = model.eval() + model = deepspeed.init_inference(model, config=inf_config) + check_dtype(model, dtype) diff --git a/inference/test_inference.py b/inference/test_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..371ecda710b29e04e42578745e4484e32f01ad15 --- /dev/null +++ b/inference/test_inference.py @@ -0,0 +1,560 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import time +import torch +import pytest +import itertools +import deepspeed +from deepspeed.git_version_info import torch_info +from unit.common import DistributedTest +from packaging import version as pkg_version +from deepspeed.ops.op_builder import OpBuilder +from transformers import pipeline +from transformers.models.t5.modeling_t5 import T5Block +from 
transformers.models.roberta.modeling_roberta import RobertaLayer +from huggingface_hub import HfApi +from deepspeed.model_implementations import DeepSpeedTransformerInference +from torch import nn +from deepspeed.accelerator import get_accelerator + +rocm_version = OpBuilder.installed_rocm_version() +if rocm_version != (0, 0): + pytest.skip("skip inference tests on rocm for now", allow_module_level=True) + +_bert_models = [ + "bert-base-cased", + "bert-base-uncased", + "bert-large-cased", + "bert-large-uncased", + "bert-base-multilingual-cased", + "bert-base-multilingual-uncased", + "deepset/minilm-uncased-squad2", + "cross-encoder/ms-marco-MiniLM-L-12-v2", + "dslim/bert-base-NER", + "bert-large-uncased-whole-word-masking-finetuned-squad", + "distilbert-base-cased-distilled-squad", +] +_roberta_models = [ + "roberta-large", + "roberta-base", + "deepset/roberta-base-squad2", + "j-hartmann/emotion-english-distilroberta-base", + "Jean-Baptiste/roberta-large-ner-english", +] +_gpt_models = [ + "gpt2", + "distilgpt2", + "Norod78/hebrew-bad_wiki-gpt_neo-tiny", + #"EleutherAI/gpt-j-6B", # Removed as this is causing OOM errors randomly + "bigscience/bloom-560m", +] +_opt_models = [ + "facebook/opt-125m", # 125m, 1.7B, ..., 175B variants have the same model architecture. + "facebook/opt-350m", # 350m applies layer norm after attnention layer which is different than other variants. 
+] +_all_models = HfApi().list_models() + +test_models = set(_bert_models + _roberta_models + _gpt_models + _opt_models) +test_tasks = [ + "fill-mask", + "question-answering", + "text-classification", + "token-classification", + "text-generation", + "text2text-generation", + "summarization", + "translation" +] +pytest.all_models = { + task: [m.modelId for m in _all_models if m.pipeline_tag == task] + for task in test_tasks +} + +_model_w_tasks = itertools.product(*[test_models, test_tasks]) + + +def _valid_model_task(model_task): + m, t = model_task + return m in pytest.all_models[t] + + +pytest.models_w_tasks = list(filter(_valid_model_task, _model_w_tasks)) +pytest.mt_names = [f"{m}-{t}" for m, t in pytest.models_w_tasks] +""" +These fixtures iterate all combinations of tasks and models, dtype, & cuda_graph +""" + + +@pytest.fixture(params=pytest.models_w_tasks, ids=pytest.mt_names) +def model_w_task(request): + return request.param + + +@pytest.fixture(params=[torch.float, torch.half], ids=["fp32", "fp16"]) +def dtype(request): + return request.param + + +@pytest.fixture(params=[True, False], ids=["CG", "noCG"]) +def enable_cuda_graph(request): + return request.param + + +""" +This fixture will validate the configuration +""" + + +@pytest.fixture() +def invalid_model_task_config(model_w_task, dtype, enable_cuda_graph): + model, task = model_w_task + msg = "" + if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"): + msg = "DS inference injection doesn't work well on older torch versions" + elif model not in pytest.all_models[task]: + msg = f"Not a valid model / task combination: {model} / {task}" + elif enable_cuda_graph and (torch_info["cuda_version"] == "0.0"): + msg = "CUDA not detected, cannot use CUDA Graph" + elif enable_cuda_graph and pkg_version.parse( + torch.__version__) < pkg_version.parse("1.10"): + msg = "CUDA Graph is only available in torch versions >= 1.10" + elif "gpt-j-6B" in model: + if dtype != torch.half: + msg = f"Not enough 
GPU memory to run {model} with dtype {dtype}"
+        elif enable_cuda_graph:
+            msg = f"Not enough GPU memory to run {model} with CUDA Graph enabled"
+    elif "gpt-neox-20b" in model:  # TODO: remove this when neox issues resolved
+        msg = "Skipping gpt-neox-20b for now"
+    elif ("gpt-neox-20b" in model) and (dtype != torch.half):
+        msg = f"Not enough GPU memory to run {model} with dtype {dtype}"
+    elif ("bloom" in model) and (dtype != torch.half):
+        msg = f"Bloom models only support half precision, cannot use dtype {dtype}"
+    elif ("bert" not in model.lower()) and enable_cuda_graph:
+        msg = "Non bert/roberta models do no support CUDA Graph"
+    return msg
+
+
+"""
+These fixtures can be used to customize the query, inference args, and assert
+statement for each combination of model /task
+"""
+
+
+@pytest.fixture
+def query(model_w_task):
+    model, task = model_w_task
+    angle_bracket_mask_models = [
+        "roberta",
+        "camembert",
+        "esm",
+        "ibert",
+        "luke",
+        "mpnet",
+        "yoso",
+        "mpnet"
+    ]
+
+    if task == "fill-mask":
+        if any(map(lambda x: x in model, angle_bracket_mask_models)):
+            return "Hello I'm a <mask> model."
+        else:
+            return "Hell I'm a [MASK] model."
+    elif task == "question-answering":
+        return {
+            "question": "What's my name?",
+            "context": "My name is Clara and I live in Berkeley",
+        }
+    elif task == "text-classification":
+        return "DeepSpeed is the greatest"
+    elif task == "token-classification":
+        return "My name is jean-baptiste and I live in montreal."
+    elif task == "text-generation":
+        return "DeepSpeed is the greatest"
+    elif task == "text2text-generation":
+        return "Is this review positive or negative? 
Review: this is the best cast iron skillet you will ever buy" + elif task == "translation" or task == "summarization": + return "Hello, my dog is cute" + else: + NotImplementedError(f'query for task "{task}" is not implemented') + + +@pytest.fixture +def inf_kwargs(model_w_task): + model, task = model_w_task + if task == "text-generation": + if model == "EleutherAI/gpt-j-6B": + # This model on V100 is hitting memory problems that limit the number of output tokens + return {"do_sample": False, "max_length": 12} + return {"do_sample": False, "max_length": 20} + else: + return {} + + +def fill_mask_assert(x, y): + return set(res["token_str"] for res in x) == set(res["token_str"] for res in y) + + +def question_answering_assert(x, y): + return x["answer"] == y["answer"] + + +def text_classification_assert(x, y): + return set(res["label"] for res in x) == set(res["label"] for res in y) + + +def token_classification_assert(x, y): + return set(ent["word"] for ent in x) == set(ent["word"] for ent in y) + + +def text_generation_assert(x, y): + return set(res["generated_text"] for res in x) == set(res["generated_text"] + for res in y) + + +def text2text_generation_assert(x, y): + return set(res["generated_text"] for res in x) == set(res["generated_text"] + for res in y) + + +def translation_assert(x, y): + return set(res["translation_text"] for res in x) == set(res["translation_text"] + for res in y) + + +def summarization_assert(x, y): + return set(res["summary_text"] for res in x) == set(res["summary_text"] for res in y) + + +@pytest.fixture +def assert_fn(model_w_task): + model, task = model_w_task + assert_fn_dict = { + "fill-mask": fill_mask_assert, + "question-answering": question_answering_assert, + "text-classification": text_classification_assert, + "token-classification": token_classification_assert, + "text-generation": text_generation_assert, + "text2text-generation": text2text_generation_assert, + "translation": translation_assert, + "summarization": 
summarization_assert + } + assert_fn = assert_fn_dict.get(task, None) + if assert_fn is None: + NotImplementedError(f'assert_fn for task "{task}" is not implemented') + return assert_fn + + +def check_injection(model): + def verify_injection(module): + for child in module.children(): + if isinstance(child, nn.ModuleList): + assert isinstance(child[0], DeepSpeedTransformerInference),\ + "DeepSpeed-Inference Transformer kernels has not been injected in the model" + break + else: + verify_injection(child) + + verify_injection(model) + + +""" +Tests +""" + + +@pytest.mark.inference +class TestModelTask(DistributedTest): + world_size = 1 + + def test( + self, + model_w_task, + dtype, + enable_cuda_graph, + query, + inf_kwargs, + assert_fn, + invalid_model_task_config, + ): + if invalid_model_task_config: + pytest.skip(invalid_model_task_config) + + model, task = model_w_task + local_rank = int(os.getenv("LOCAL_RANK", "0")) + + # Load the model on CPU first to avoid OOM for large models @fp32 + pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") + if dtype == torch.half: + pipe.model.half() + + # Switch device to GPU after converting to half + device = torch.device(get_accelerator().device_name(local_rank)) + pipe.device = device + pipe.model.to(device) + + # Warm-up queries for perf measurement + #for i in range(10): + # _ = pipe(query, **inf_kwargs) + get_accelerator().synchronize() + start = time.time() + bs_output = pipe(query, **inf_kwargs) + get_accelerator().synchronize() + bs_time = time.time() - start + + pipe.model = deepspeed.init_inference( + pipe.model, + mp_size=1, + dtype=dtype, + replace_with_kernel_inject=True, + enable_cuda_graph=enable_cuda_graph, + ) + check_injection(pipe.model) + # Warm-up queries for perf measurement + #for i in range(10): + # _ = pipe(query, **inf_kwargs) + get_accelerator().synchronize() + start = time.time() + ds_output = pipe(query, **inf_kwargs) + get_accelerator().synchronize() + ds_time = 
time.time() - start + + # facebook/opt* and some bigscient/bloom* models are not matching + # baseline exactly, adding an exception to them for now + if ("opt" in model) or ("bloom" in model): + bs_output = pipe(query, **inf_kwargs) + + # These performance tests are only measuring the time for a single + # inference request, we just want to check that performance isn't terrible + #assert ds_time <= (bs_time * 1.1) + assert assert_fn(bs_output, ds_output) + + +@pytest.mark.seq_inference +@pytest.mark.parametrize("model_w_task", + [("EleutherAI/gpt-neo-1.3B", + "text-generation"), + ("EleutherAI/gpt-neox-20b", + "text-generation"), + ("bigscience/bloom-3b", + "text-generation"), + ("EleutherAI/gpt-j-6B", + "text-generation")], + ids=["gpt-neo", + "gpt-neox", + "bloom", + "gpt-j"]) +class TestMPSize(DistributedTest): + world_size = 4 + + def test( + self, + model_w_task, + dtype, + query, + inf_kwargs, + assert_fn, + invalid_model_task_config, + ): + if invalid_model_task_config: + pytest.skip(invalid_model_task_config) + + model, task = model_w_task + local_rank = int(os.getenv("LOCAL_RANK", "0")) + + # We have to load these large models on CPU with pipeline because not + # enough GPU memory + pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") + bs_output = pipe(query, **inf_kwargs) + + pipe.model = deepspeed.init_inference(pipe.model, + mp_size=self.world_size, + dtype=dtype, + replace_with_kernel_inject=True) + check_injection(pipe.model) + # Switch device to GPU so that input tensors are not on CPU + pipe.device = torch.device(get_accelerator().device_name(local_rank)) + ds_output = pipe(query, **inf_kwargs) + + print(local_rank, "baseline", bs_output) + print(local_rank, "deepspeed", ds_output) + assert assert_fn(bs_output, ds_output) + + +@pytest.mark.seq_inference +@pytest.mark.parametrize( + "model_w_task, injection_policy", + [ + (("google/t5-v1_1-small", + "text2text-generation"), + { + T5Block: ('SelfAttention.o', + 
'EncDecAttention.o', + 'DenseReluDense.wo') + }), + (("roberta-large", + "fill-mask"), + { + RobertaLayer: ('output.dense') + }), + ], + ids=["t5", + "roberta"], +) +@pytest.mark.parametrize("dtype", [torch.float], ids=["fp32"]) +@pytest.mark.parametrize("enable_cuda_graph", [False], ids=["noCG"]) +class TestInjectionPolicy(DistributedTest): + world_size = [1, 2] + + def test( + self, + model_w_task, + injection_policy, + query, + inf_kwargs, + assert_fn, + invalid_model_task_config, + dtype, + enable_cuda_graph, + ): + if invalid_model_task_config: + pytest.skip(invalid_model_task_config) + + model, task = model_w_task + local_rank = int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "2")) + + # We have to load these large models on CPU with pipeline because not + # enough GPU memory + pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") + bs_output = pipe(query, **inf_kwargs) + + pipe.model = deepspeed.init_inference(pipe.model, + mp_size=world_size, + dtype=dtype, + injection_policy=injection_policy) + # Switch device to GPU so that input tensors are not on CPU + pipe.device = torch.device(get_accelerator().device_name(local_rank)) + ds_output = pipe(query, **inf_kwargs) + + print(local_rank, "baseline", bs_output) + print(local_rank, "deepspeed", ds_output) + assert assert_fn(bs_output, ds_output) + + +@pytest.mark.seq_inference +@pytest.mark.parametrize( + "model_w_task", + [ + ("Helsinki-NLP/opus-mt-en-de", + "translation"), + ], + ids=[ + "marian", + ], +) +@pytest.mark.parametrize("dtype", [torch.float16], ids=["fp16"]) +@pytest.mark.parametrize("enable_cuda_graph", [False], ids=["noCG"]) +class TestAutoTensorParallelism(DistributedTest): + world_size = [2] + + def test( + self, + model_w_task, + query, + inf_kwargs, + assert_fn, + invalid_model_task_config, + dtype, + enable_cuda_graph, + ): + if invalid_model_task_config: + pytest.skip(invalid_model_task_config) + + model, task = model_w_task + 
local_rank = int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "2")) + + # We have to load these large models on CPU with pipeline because not + # enough GPU memory + pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") + bs_output = pipe(query, **inf_kwargs) + + pipe.model = deepspeed.init_inference(pipe.model, + mp_size=world_size, + dtype=dtype) + # Switch device to GPU so that input tensors are not on CPU + pipe.device = torch.device(get_accelerator().device_name(local_rank)) + ds_output = pipe(query, **inf_kwargs) + + print(local_rank, "baseline", bs_output) + print(local_rank, "deepspeed", ds_output) + assert assert_fn(bs_output, ds_output) + + +@pytest.mark.nightly +@pytest.mark.parametrize( + "model_family, model_name", + ( + ["gpt2", + "EleutherAI/gpt-neo-2.7B"], + ["gpt2", + "EleutherAI/gpt-j-6B"], + ["gpt2", + "gpt2-xl"], + ), +) +@pytest.mark.parametrize("task", ["lambada_standard"]) +class TestLMCorrectness(DistributedTest): + world_size = 1 + + def test(self, model_family, model_name, task): + # imports here to avoid import errors when pytest collects tests + import lm_eval + import lm_eval.models + import lm_eval.tasks + import lm_eval.evaluator + + local_rank = os.getenv("LOCAL_RANK", "0") + device = torch.device(get_accelerator().device_name(local_rank)) + dtype = torch.float + task_dict = lm_eval.tasks.get_task_dict([task]) + + if 'gpt-j-6B' in model_name: + dtype = torch.half + lm = lm_eval.models.get_model(model_family).create_from_arg_string( + f"pretrained={model_name}", + {"device": "cpu"}) + setattr(lm, model_family, getattr(lm, model_family).half().to(device)) + lm._device = device + else: + lm = lm_eval.models.get_model(model_family).create_from_arg_string( + f"pretrained={model_name}", + {"device": get_accelerator().device_name()}) + + get_accelerator().synchronize() + start = time.time() + bs_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict) + 
get_accelerator().synchronize() + bs_time = time.time() - start + + ds_model = deepspeed.init_inference( + getattr(lm, + model_family), + mp_size=1, + dtype=dtype, + replace_with_kernel_inject=True, + enable_cuda_graph=False, + ) + check_injection(ds_model) + setattr(lm, model_family, ds_model) + get_accelerator().synchronize() + start = time.time() + ds_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict) + get_accelerator().synchronize() + ds_time = time.time() - start + + ppl_diff = abs(bs_output["results"][task]["ppl"] - + ds_output["results"][task]["ppl"]) + #assert ds_time <= bs_time + assert ppl_diff < 0.01 diff --git a/inference/test_inference_config.py b/inference/test_inference_config.py new file mode 100644 index 0000000000000000000000000000000000000000..e19f73ea35d6e4dea973255c90e76241a957f2f5 --- /dev/null +++ b/inference/test_inference_config.py @@ -0,0 +1,41 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +import torch +import deepspeed +from unit.common import DistributedTest +from unit.simple_model import create_config_from_dict + + +@pytest.mark.inference +class TestInferenceConfig(DistributedTest): + world_size = 1 + + def test_overlap_kwargs(self): + config = {"replace_with_kernel_inject": True} + kwargs = {"replace_with_kernel_inject": True} + + engine = deepspeed.init_inference(torch.nn.Module(), config=config, **kwargs) + assert engine._config.replace_with_kernel_inject + + def test_overlap_kwargs_conflict(self): + config = {"replace_with_kernel_inject": True} + kwargs = {"replace_with_kernel_inject": False} + + with pytest.raises(ValueError): + engine = deepspeed.init_inference(torch.nn.Module(), config=config, **kwargs) + + def test_kwargs_and_config(self): + config = {"replace_with_kernel_inject": True} + kwargs = {"dtype": torch.float32} + + engine = deepspeed.init_inference(torch.nn.Module(), config=config, **kwargs) + assert engine._config.replace_with_kernel_inject + assert engine._config.dtype == 
kwargs["dtype"] + + def test_json_config(self, tmpdir): + config = {"replace_with_kernel_inject": True} + config_json = create_config_from_dict(tmpdir, config) + + engine = deepspeed.init_inference(torch.nn.Module(), config=config_json) + assert engine._config.replace_with_kernel_inject diff --git a/inference/test_model_profiling.py b/inference/test_model_profiling.py new file mode 100644 index 0000000000000000000000000000000000000000..07ce839306a6449be4c5b2953b99ecf99da42bb5 --- /dev/null +++ b/inference/test_model_profiling.py @@ -0,0 +1,90 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import time +import pytest +import torch +import deepspeed +from transformers import pipeline +from unit.common import DistributedTest +from deepspeed.accelerator import get_accelerator + + +@pytest.fixture +def query(model, task): + if task == "text-generation": + return "DeepSpeed is" + elif task == "fill-mask": + if "roberta" in model: + return "I am a model" + else: + return "I am a [MASK] model" + else: + raise NotImplementedError + + +@pytest.fixture +def inf_kwargs(task): + if task == "text-generation": + return {"do_sample": False, "min_length": 50, "max_length": 50} + else: + return {} + + +@pytest.mark.inference +@pytest.mark.parametrize("model,task", + [ + ("bert-base-cased", + "fill-mask"), + ("roberta-base", + "fill-mask"), + ("gpt2", + "text-generation"), + ("facebook/opt-125m", + "text-generation"), + ("bigscience/bloom-560m", + "text-generation"), + ]) +@pytest.mark.parametrize("cuda_graphs", [True, False]) +@pytest.mark.parametrize("use_cuda_events", [True, False]) +class TestModelProfiling(DistributedTest): + world_size = 1 + + def test(self, + model, + task, + query, + inf_kwargs, + cuda_graphs, + use_cuda_events, + dtype=torch.float16): + if cuda_graphs and "bert" not in model: + pytest.skip(f"CUDA Graph not supported for {model}") + + local_rank = int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + + pipe = 
pipeline(task, model, framework="pt", device=local_rank) + pipe.model = deepspeed.init_inference(pipe.model, + dtype=dtype, + mp_size=world_size, + replace_with_kernel_inject=True, + enable_cuda_graph=cuda_graphs) + pipe.model.profile_model_time(use_cuda_events=use_cuda_events) + + e2e_times = [] + model_times = [] + for _ in range(10): + get_accelerator().synchronize() + start = time.perf_counter_ns() + + r = pipe(query, **inf_kwargs) + + get_accelerator().synchronize() + end = time.perf_counter_ns() + + e2e_times.append((end - start) / 1e6) # convert ns to ms + model_times.extend(pipe.model.model_times()) + + for e2e_t, model_t in zip(e2e_times, model_times): + assert e2e_t >= model_t diff --git a/install.sh b/install.sh old mode 100644 new mode 100755 diff --git a/op_builder/__init__.py b/op_builder/__init__.py index dcac71011aa8be21d81f2b55f9692fc99d6b1211..51546be6ccaf7b39e969f416b28bb472b8d6442c 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -1,32 +1,50 @@ """ Copyright 2020 The Microsoft DeepSpeed Team """ -from .cpu_adam import CPUAdamBuilder -from .cpu_adagrad import CPUAdagradBuilder -from .fused_adam import FusedAdamBuilder -from .fused_lamb import FusedLambBuilder -from .sparse_attn import SparseAttnBuilder -from .transformer import TransformerBuilder -from .stochastic_transformer import StochasticTransformerBuilder -from .utils import UtilsBuilder -from .async_io import AsyncIOBuilder -from .transformer_inference import InferenceBuilder -from .quantizer import QuantizerBuilder +import sys +import os +import pkgutil +import importlib + from .builder import get_default_compute_capabilities, OpBuilder -# TODO: infer this list instead of hard coded -# List of all available ops -__op_builders__ = [ - CPUAdamBuilder(), - CPUAdagradBuilder(), - FusedAdamBuilder(), - FusedLambBuilder(), - SparseAttnBuilder(), - TransformerBuilder(), - StochasticTransformerBuilder(), - AsyncIOBuilder(), - UtilsBuilder(), - QuantizerBuilder(), - 
InferenceBuilder() -] -ALL_OPS = {op.name: op for op in __op_builders__} +# List of all available op builders from deepspeed op_builder +try: + import deepspeed.ops.op_builder # noqa: F401 + op_builder_dir = "deepspeed.ops.op_builder" +except ImportError: + op_builder_dir = "op_builder" + +__op_builders__ = [] + +this_module = sys.modules[__name__] + + +def builder_closure(member_name): + if op_builder_dir == "op_builder": + # during installation time cannot get builder due to torch not installed, + # return closure instead + def _builder(): + from deepspeed.accelerator import get_accelerator + builder = get_accelerator().create_op_builder(member_name) + return builder + + return _builder + else: + # during runtime, return op builder class directly + from deepspeed.accelerator import get_accelerator + builder = get_accelerator().get_op_builder(member_name) + return builder + + +# reflect builder names and add builder closure, such as 'TransformerBuilder()' creates op builder wrt current accelerator +for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__file__)]): + if module_name != 'all_ops' and module_name != 'builder': + module = importlib.import_module(f".{module_name}", package=op_builder_dir) + for member_name in module.__dir__(): + if member_name.endswith( + 'Builder' + ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder": + # assign builder name to variable with same name + # the following is equivalent to i.e. 
TransformerBuilder = "TransformerBuilder" + this_module.__dict__[member_name] = builder_closure(member_name) diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a6010addb60bb5c13aec5a7686297bd709e276dd --- /dev/null +++ b/op_builder/all_ops.py @@ -0,0 +1,30 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" +import os +import pkgutil +import importlib +try: + # during installation time accelerator is visible, otherwise return deepspeed.accelerator + from accelerator import get_accelerator +except ImportError: + from deepspeed.accelerator import get_accelerator + +# List of all available ops + +# reflect all builder names into __op_builders__ +op_builder_dir = get_accelerator().op_builder_dir() +op_builder_module = importlib.import_module(op_builder_dir) +__op_builders__ = [] + +for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]): + # avoid self references + if module_name != 'all_ops' and module_name != 'builder': + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith('Builder'): + # append builder to __op_builders__ list + builder = get_accelerator().create_op_builder(member_name) + __op_builders__.append(builder) + +ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} diff --git a/op_builder/async_io.py b/op_builder/async_io.py index aec7911ce96bff070f3b2cc65f5fa58170a2cb86..95cecdbede453d4009594cdae0772492ec98905c 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -26,7 +26,8 @@ class AsyncIOBuilder(OpBuilder): 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', - 'csrc/aio/common/deepspeed_aio_types.cpp' + 'csrc/aio/common/deepspeed_aio_types.cpp', + 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' ] def include_paths(self): @@ -51,7 +52,9 @@ class 
AsyncIOBuilder(OpBuilder): ] def extra_ldflags(self): - return ['-laio'] + #aiss + #return ['-laio'] + return ['-laio', '-liomp5'] def check_for_libaio_pkg(self): libs = dict( diff --git a/op_builder/builder.py b/op_builder/builder.py index 6151fcc6b73a78a259bb19a57565b25b6f9bd943..d818628a0a10332a98d17a8e63b5a42e787ee9c6 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -4,7 +4,6 @@ Copyright 2020 The Microsoft DeepSpeed Team import os import sys import time -import json import importlib from pathlib import Path import subprocess @@ -16,6 +15,7 @@ import distutils.log import distutils.sysconfig from distutils.errors import CompileError, LinkError from abc import ABC, abstractmethod +from typing import List YELLOW = '\033[93m' END = '\033[0m' @@ -35,7 +35,10 @@ else: TORCH_MINOR = int(torch.__version__.split('.')[1]) -def installed_cuda_version(): +def installed_cuda_version(name=""): + import torch.cuda + if not torch.cuda.is_available(): + return 0, 0 import torch.utils.cpp_extension cuda_home = torch.utils.cpp_extension.CUDA_HOME assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)" @@ -48,7 +51,6 @@ def installed_cuda_version(): release = output_split[release_idx + 1].replace(',', '').split(".") # Ignore patch versions, only look at major + minor cuda_major, cuda_minor = release[:2] - installed_cuda_version = ".".join(release[:2]) return int(cuda_major), int(cuda_minor) @@ -73,20 +75,22 @@ cuda_minor_mismatch_ok = { "10.1", "10.2", ], - 11: [ - "11.0", - "11.1", - "11.2", - "11.3", - "11.4", - "11.5", - "11.6", - ], + 11: ["11.0", + "11.1", + "11.2", + "11.3", + "11.4", + "11.5", + "11.6", + "11.7", + "11.8"], } -def assert_no_cuda_mismatch(): - cuda_major, cuda_minor = installed_cuda_version() +def assert_no_cuda_mismatch(name=""): + cuda_major, cuda_minor = installed_cuda_version(name) + if cuda_minor == 0 and cuda_major == 0: + return False sys_cuda_version = f'{cuda_major}.{cuda_minor}' torch_cuda_version = 
".".join(torch.version.cuda.split('.')[:2]) # This is a show-stopping error, should probably not proceed past this @@ -97,11 +101,12 @@ def assert_no_cuda_mismatch(): print(f"Installed CUDA version {sys_cuda_version} does not match the " f"version torch was compiled with {torch.version.cuda} " "but since the APIs are compatible, accepting this combination") - return + return True raise Exception( - f"Installed CUDA version {sys_cuda_version} does not match the " + f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the " f"version torch was compiled with {torch.version.cuda}, unable to compile " "cuda/cpp extensions without a matching cuda version.") + return True class OpBuilder(ABC): @@ -111,6 +116,8 @@ class OpBuilder(ABC): def __init__(self, name): self.name = name self.jit_mode = False + self.build_for_cpu = False + self.error_log = None @abstractmethod def absolute_name(self): @@ -131,34 +138,39 @@ class OpBuilder(ABC): pass @staticmethod - def assert_torch_info(torch_info): + def validate_torch_version(torch_info): install_torch_version = torch_info['version'] - install_cuda_version = torch_info['cuda_version'] - install_hip_version = torch_info['hip_version'] - - if not OpBuilder.is_rocm_pytorch(): - current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) - else: - current_hip_version = ".".join(torch.version.hip.split('.')[:2]) - current_torch_version = ".".join(torch.__version__.split('.')[:2]) + if install_torch_version != current_torch_version: + raise RuntimeError( + "PyTorch version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. 
" + f"Install torch version={install_torch_version}, " + f"Runtime torch version={current_torch_version}") + @staticmethod + def validate_torch_op_version(torch_info): if not OpBuilder.is_rocm_pytorch(): - if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version: + current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + install_cuda_version = torch_info['cuda_version'] + if install_cuda_version != current_cuda_version: raise RuntimeError( - "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. Please re-install " - f"DeepSpeed or switch torch versions. DeepSpeed install versions: " - f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:" - f"torch={current_torch_version}, cuda={current_cuda_version}") + "CUDA version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. " + f"Install CUDA version={install_cuda_version}, " + f"Runtime CUDA version={current_cuda_version}") else: - if install_hip_version != current_hip_version or install_torch_version != current_torch_version: + current_hip_version = ".".join(torch.version.hip.split('.')[:2]) + install_hip_version = torch_info['hip_version'] + if install_hip_version != current_hip_version: raise RuntimeError( - "PyTorch and HIP version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. Please re-install " - f"DeepSpeed or switch torch versions. DeepSpeed install versions: " - f"torch={install_torch_version}, hip={install_hip_version}, runtime versions:" - f"torch={current_torch_version}, hip={current_hip_version}") + "HIP version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. 
" + f"Please re-install DeepSpeed or switch torch versions. " + f"Install HIP version={install_hip_version}, " + f"Runtime HIP version={current_hip_version}") @staticmethod def is_rocm_pytorch(): @@ -189,12 +201,15 @@ class OpBuilder(ABC): ROCM_MINOR = '0' if OpBuilder.is_rocm_pytorch(): from torch.utils.cpp_extension import ROCM_HOME - #with open('/opt/rocm/.info/version-dev', 'r') as file: - #aiss 20220810 - ds_build_version = os.getenv('ROCM_PATH', "") - - with open('{0}/.info/version-dev'.format(ds_build_version), 'r') as file: - ROCM_VERSION_DEV_RAW = file.read() + rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version-dev") + if rocm_ver_file.is_file(): + with open(rocm_ver_file, 'r') as file: + ROCM_VERSION_DEV_RAW = file.read() + elif "rocm" in torch.__version__: + ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1] + else: + assert False, "Could not detect ROCm version" + assert ROCM_VERSION_DEV_RAW != "", "Could not detect ROCm version" ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0] ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1] OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR)) @@ -225,7 +240,9 @@ class OpBuilder(ABC): return True def extra_ldflags(self): - return [] + #aiss + #return [] + return ['-liomp5'] def libraries_installed(self, libraries): valid = False @@ -353,6 +370,17 @@ class OpBuilder(ABC): return '-mcpu=native' return '-march=native' + def is_cuda_enable(self): + try: + if torch.cuda.is_available(): + return '-D__ENABLE_CUDA__' + except: + print( + f"{WARNING} {self.name} torch.cuda is missing, only cpu ops can be compiled!" 
+ ) + return '-D__DISABLE_CUDA__' + return '-D__DISABLE_CUDA__' + def _backup_cpuinfo(self): # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides if not self.command_exists('lscpu'): @@ -372,6 +400,8 @@ class OpBuilder(ABC): cpu_info['arch'] = 'X86_64' if 'avx512' in result: cpu_info['flags'] += 'avx512,' + elif 'avx512f' in result: + cpu_info['flags'] += 'avx512f,' if 'avx2' in result: cpu_info['flags'] += 'avx2' elif 'ppc64le' in result: @@ -398,24 +428,12 @@ class OpBuilder(ABC): return '-D__SCALAR__' if cpu_info['arch'] == 'X86_64': - if 'avx512' in cpu_info['flags']: + if 'avx512' in cpu_info['flags'] or 'avx512f' in cpu_info['flags']: return '-D__AVX512__' elif 'avx2' in cpu_info['flags']: return '-D__AVX256__' return '-D__SCALAR__' - def python_requirements(self): - ''' - Override if op wants to define special dependencies, otherwise will - take self.name and load requirements-.txt if it exists. - ''' - path = f'requirements/requirements-{self.name}.txt' - requirements = [] - if os.path.isfile(path): - with open(path, 'r') as fd: - requirements = [r.strip() for r in fd.readlines()] - return requirements - def command_exists(self, cmd): if '|' in cmd: cmds = cmd.split("|") @@ -437,6 +455,7 @@ class OpBuilder(ABC): return valid def warning(self, msg): + self.error_log = f"{msg}" print(f"{WARNING} {msg}") def deepspeed_src_path(self, code_path): @@ -455,13 +474,15 @@ class OpBuilder(ABC): extra_link_args=self.strip_empty_entries(self.extra_ldflags())) def load(self, verbose=True): - from ...git_version_info import installed_ops, torch_info + from deepspeed.git_version_info import installed_ops, torch_info if installed_ops[self.name]: # Ensure the op we're about to load was compiled with the same # torch/cuda versions we are currently using at runtime. 
- if isinstance(self, CUDAOpBuilder): - self.assert_torch_info(torch_info) - + self.validate_torch_version(torch_info) + if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder): +#aiss HIP version mismatch error + #self.validate_torch_op_version(torch_info) + pass return importlib.import_module(self.absolute_name()) else: return self.jit_load(verbose) @@ -469,28 +490,21 @@ class OpBuilder(ABC): def jit_load(self, verbose=True): if not self.is_compatible(verbose): raise RuntimeError( - f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue." + f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}" ) try: - import ninja + import ninja # noqa: F401 except ImportError: raise RuntimeError( f"Unable to JIT load the {self.name} op due to ninja not being installed." ) if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): - assert_no_cuda_mismatch() + self.build_for_cpu = not assert_no_cuda_mismatch(self.name) self.jit_mode = True from torch.utils.cpp_extension import load - # Ensure directory exists to prevent race condition in some cases - ext_path = os.path.join( - os.environ.get('TORCH_EXTENSIONS_DIR', - DEFAULT_TORCH_EXTENSION_PATH), - self.name) - os.makedirs(ext_path, exist_ok=True) - start_build = time.time() sources = [self.deepspeed_src_path(path) for path in self.sources()] extra_include_paths = [ @@ -514,6 +528,7 @@ class OpBuilder(ABC): extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), verbose=verbose) + build_duration = time.time() - start_build if verbose: print(f"Time to load {self.name} op: {build_duration} seconds") @@ -539,7 +554,7 @@ class CUDAOpBuilder(OpBuilder): - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples: TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ... 
- TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ... + TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ... - `cross_compile_archs` uses ; separator. @@ -569,6 +584,12 @@ class CUDAOpBuilder(OpBuilder): cross_compile_archs = get_default_compute_capabilities() ccs = cross_compile_archs.split(';') + ccs = self.filter_ccs(ccs) + if len(ccs) == 0: + raise RuntimeError( + f"Unable to load {self.name} op due to no compute capabilities remaining after filtering" + ) + args = [] for cc in ccs: num = cc[0] + cc[2] @@ -578,6 +599,13 @@ class CUDAOpBuilder(OpBuilder): return args + def filter_ccs(self, ccs: List[str]): + """ + Prune any compute capabilities that are not compatible with the builder. Should log + which CCs have been pruned. + """ + return ccs + def version_dependent_macros(self): # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 version_ge_1_1 = [] @@ -595,18 +623,33 @@ class CUDAOpBuilder(OpBuilder): return super().is_compatible(verbose) def builder(self): - from torch.utils.cpp_extension import CUDAExtension + #self.build_for_cpu = not assert_no_cuda_mismatch(self.name) + #aiss if not self.is_rocm_pytorch(): - assert_no_cuda_mismatch() - cuda_ext = CUDAExtension( + self.build_for_cpu = not assert_no_cuda_mismatch(self.name) + if self.build_for_cpu: + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + else: + from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder + else: + from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder + #self.build_for_cpu = not assert_no_cuda_mismatch(self.name) + #if self.build_for_cpu: + # from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + #else: + # from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder + + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \ + {'cxx': 
self.strip_empty_entries(self.cxx_args()), \ + 'nvcc': self.strip_empty_entries(self.nvcc_args())} + + cuda_ext = ExtensionBuilder( name=self.absolute_name(), sources=self.strip_empty_entries(self.sources()), include_dirs=self.strip_empty_entries(self.include_paths()), libraries=self.strip_empty_entries(self.libraries_args()), - extra_compile_args={ - 'cxx': self.strip_empty_entries(self.cxx_args()), - 'nvcc': self.strip_empty_entries(self.nvcc_args()) - }) + extra_compile_args=compile_args) + if self.is_rocm_pytorch(): # hip converts paths to absolute, this converts back to relative sources = cuda_ext.sources @@ -619,18 +662,46 @@ class CUDAOpBuilder(OpBuilder): def hipify_extension(self): if self.is_rocm_pytorch(): + #from public.home.aishsh.code.hipify_torch.hipify import hipify_python from torch.utils.hipify import hipify_python + #hipify_python.hipify( + # project_directory=os.getcwd(), + # output_directory=os.getcwd(), + # header_include_dirs=self.include_paths(), + # includes=[os.path.join(os.getcwd(), + # '*')] + [os.path.abspath(s) for s in self.sources()], + # extra_files=[os.path.abspath(s) for s in self.sources()], + # show_detailed=True, + # is_pytorch_extension=True, + # hipify_extra_files_only=True, + #) + hipify_python.hipify( project_directory=os.getcwd(), output_directory=os.getcwd(), - header_include_dirs=self.include_paths(), + #header_include_dirs=self.include_paths(), includes=[os.path.join(os.getcwd(), '*')], extra_files=[os.path.abspath(s) for s in self.sources()], - show_detailed=True, + show_progress=True, is_pytorch_extension=True, - hipify_extra_files_only=True, + #hipify_extra_files_only=True, ) +#def hipify( +# extensions: Iterable = (".cu", ".cuh", ".c", ".cc", ".cpp", ".h", ".in", ".hpp"), +# includes: Iterable = (), +# extra_files: Iterable = (), +# out_of_place_only: bool = False, +# ignores: Iterable = (), +# show_progress: bool = True, +# hip_clang_launch: bool = False, +# is_pytorch_extension: bool = False, +# clean_ctx: 
GeneratedFileCleaner = None +#) + + + + def cxx_args(self): if sys.platform == "win32": @@ -639,6 +710,8 @@ class CUDAOpBuilder(OpBuilder): return ['-O3', '-std=c++14', '-g', '-Wno-reorder'] def nvcc_args(self): + if self.build_for_cpu: + return [] args = ['-O3'] if self.is_rocm_pytorch(): ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() @@ -648,11 +721,13 @@ class CUDAOpBuilder(OpBuilder): '-U__HIP_NO_HALF_CONVERSIONS__', '-U__HIP_NO_HALF2_OPERATORS__', '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, - '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR + '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR, + '--gpu-max-threads-per-block=1024' ] else: cuda_major, _ = installed_cuda_version() args += [ + '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', '-std=c++17' if sys.platform == "win32" and cuda_major > 10 else '-std=c++14', @@ -660,43 +735,57 @@ class CUDAOpBuilder(OpBuilder): '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__' ] + if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1': + args.append('--ptxas-options=-v') args += self.compute_capability_args() return args def libraries_args(self): + if self.build_for_cpu: + return [] + if sys.platform == "win32": return ['cublas', 'curand'] else: - return [] + return ['iomp5'] class TorchCPUOpBuilder(CUDAOpBuilder): def extra_ldflags(self): + if self.build_for_cpu: + return ['-fopenmp'] + if not self.is_rocm_pytorch(): return ['-lcurand'] - else: - return [] + + #return [] + return ['-liomp5'] def cxx_args(self): import torch - if not self.is_rocm_pytorch(): - CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") - else: - CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib") + args = [] + if not self.build_for_cpu: + if not self.is_rocm_pytorch(): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") + else: + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib") + + args += super().cxx_args() + args += [ + 
f'-L{CUDA_LIB64}', + '-lcudart', + '-lcublas', + '-g', + ] + CPU_ARCH = self.cpu_arch() SIMD_WIDTH = self.simd_width() - - args = super().cxx_args() - ###aiss debug0506########### + CUDA_ENABLE = self.is_cuda_enable() args += [ - f'-L{CUDA_LIB64}', - #'-lcudart', - #'-lcublas', - '-lrocblas', - '-lhipblas', - '-g', CPU_ARCH, '-fopenmp', SIMD_WIDTH, + CUDA_ENABLE, ] + return args diff --git a/op_builder/cpu_adagrad.py b/op_builder/cpu_adagrad.py index 24f0ff3ff52505c11c61a188f34a2b15e68fc5bc..bf7c98052fc541f2de5a6c331d414d522bc9e821 100644 --- a/op_builder/cpu_adagrad.py +++ b/op_builder/cpu_adagrad.py @@ -16,11 +16,25 @@ class CPUAdagradBuilder(TorchCPUOpBuilder): return f'deepspeed.ops.adagrad.{self.NAME}_op' def sources(self): + if self.build_for_cpu: + return ['csrc/adagrad/cpu_adagrad.cpp'] + return ['csrc/adagrad/cpu_adagrad.cpp', 'csrc/common/custom_cuda_kernel.cu'] + def libraries_args(self): + args = super().libraries_args() + if self.build_for_cpu: + return args + + if not self.is_rocm_pytorch(): + args += ['curand'] + return args + def include_paths(self): import torch - if not self.is_rocm_pytorch(): + if self.build_for_cpu: + CUDA_INCLUDE = [] + elif not self.is_rocm_pytorch(): CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] else: CUDA_INCLUDE = [ @@ -34,11 +48,3 @@ class CPUAdagradBuilder(TorchCPUOpBuilder): "hiprand"), ] return ['csrc/includes'] + CUDA_INCLUDE - - ##aiss add - def cxx_args(self): - - return [ - '-lrocblas', - '-lhipblas', - '-D__HIP_DEVICE_COMPILE__',] diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py index 35a2a8a17d54b3f60a70e2ce8fee32488372a75c..ed5f32929a22cd754c0466450fd0306ee040bbf6 100644 --- a/op_builder/cpu_adam.py +++ b/op_builder/cpu_adam.py @@ -8,7 +8,7 @@ from .builder import TorchCPUOpBuilder class CPUAdamBuilder(TorchCPUOpBuilder): BUILD_VAR = "DS_BUILD_CPU_ADAM" NAME = "cpu_adam" - + def __init__(self): super().__init__(name=self.NAME) @@ -16,19 +16,25 @@ class 
CPUAdamBuilder(TorchCPUOpBuilder): return f'deepspeed.ops.adam.{self.NAME}_op' def sources(self): + if self.build_for_cpu: + return ['csrc/adam/cpu_adam.cpp'] + return ['csrc/adam/cpu_adam.cpp', 'csrc/common/custom_cuda_kernel.cu'] def libraries_args(self): args = super().libraries_args() - #args += ['curand'] - #aiss debug 0506######## - args += ['hiprand'] - args += ['rocrand'] + if self.build_for_cpu: + return args + + if not self.is_rocm_pytorch(): + args += ['curand'] return args def include_paths(self): import torch - if not self.is_rocm_pytorch(): + if self.build_for_cpu: + CUDA_INCLUDE = [] + elif not self.is_rocm_pytorch(): CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] else: CUDA_INCLUDE = [ @@ -42,12 +48,3 @@ class CPUAdamBuilder(TorchCPUOpBuilder): "hiprand"), ] return ['csrc/includes'] + CUDA_INCLUDE - - ##aiss add - def cxx_args(self): - - return [ - '-lrocblas', - '-lhipblas', - '-D__HIP_DEVICE_COMPILE__', - ] diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py index 6ff264fbf1a1089a2d35afc520f98d9d7548f924..2883d417ede9e0e66b356ff26669e0f80a20419c 100644 --- a/op_builder/fused_adam.py +++ b/op_builder/fused_adam.py @@ -3,6 +3,8 @@ Copyright 2020 The Microsoft DeepSpeed Team """ from .builder import CUDAOpBuilder +import sys + class FusedAdamBuilder(CUDAOpBuilder): BUILD_VAR = "DS_BUILD_FUSED_ADAM" @@ -27,6 +29,9 @@ class FusedAdamBuilder(CUDAOpBuilder): def nvcc_args(self): nvcc_flags = ['-O3'] + self.version_dependent_macros() if not self.is_rocm_pytorch(): - nvcc_flags.extend(['-lineinfo', - '--use_fast_math'] + self.compute_capability_args()) + nvcc_flags.extend([ + '-allow-unsupported-compiler' if sys.platform == "win32" else '', + '-lineinfo', + '--use_fast_math' + ] + self.compute_capability_args()) return nvcc_flags diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py index 106728f6f3fe9e6449deeb0228410e50cfc8648a..d5f88d0b1ad1630950e319bc941b04d91b13338a 100644 --- 
a/op_builder/fused_lamb.py +++ b/op_builder/fused_lamb.py @@ -3,6 +3,8 @@ Copyright 2020 The Microsoft DeepSpeed Team """ from .builder import CUDAOpBuilder +import sys + class FusedLambBuilder(CUDAOpBuilder): BUILD_VAR = 'DS_BUILD_FUSED_LAMB' @@ -33,6 +35,9 @@ class FusedLambBuilder(CUDAOpBuilder): '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR ] else: - nvcc_flags.extend(['-lineinfo', - '--use_fast_math'] + self.compute_capability_args()) + nvcc_flags.extend([ + '-allow-unsupported-compiler' if sys.platform == "win32" else '', + '-lineinfo', + '--use_fast_math' + ] + self.compute_capability_args()) return nvcc_flags diff --git a/op_builder/quantizer.py b/op_builder/quantizer.py index 43bc5778ea20fbb658bcde6c6f96d0fc5f840e3b..e2c2c9564a29d194ce28b6107e90b7db4da6ba95 100644 --- a/op_builder/quantizer.py +++ b/op_builder/quantizer.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from .builder import CUDAOpBuilder @@ -15,8 +17,13 @@ class QuantizerBuilder(CUDAOpBuilder): def sources(self): return [ 'csrc/quantization/pt_binding.cpp', - 'csrc/quantization/quantizer.cu', + 'csrc/quantization/fake_quantizer.cu', + 'csrc/quantization/quantize.cu', + 'csrc/quantization/dequantize.cu', ] def include_paths(self): return ['csrc/includes'] + + def extra_ldflags(self): + return ['-lcurand'] diff --git a/deepspeed/ops/op_builder/transformer.py b/op_builder/random_ltd.py similarity index 53% rename from deepspeed/ops/op_builder/transformer.py rename to op_builder/random_ltd.py index 239f29552d980984dae4884e5a3272e6a30b68ce..79c86c1346c5bf89f87dba5758017999c2b9faae 100644 --- a/deepspeed/ops/op_builder/transformer.py +++ b/op_builder/random_ltd.py @@ -1,19 +1,19 @@ """ -Copyright 2020 The Microsoft DeepSpeed Team +Copyright 2022 The Microsoft DeepSpeed Team """ from .builder import CUDAOpBuilder -class TransformerBuilder(CUDAOpBuilder): - BUILD_VAR = "DS_BUILD_TRANSFORMER" - NAME = "transformer" +class RandomLTDBuilder(CUDAOpBuilder): + BUILD_VAR = "DS_BUILD_RANDOM_LTD" 
+ NAME = "random_ltd" def __init__(self, name=None): name = self.NAME if name is None else name super().__init__(name=name) def absolute_name(self): - return f'deepspeed.ops.transformer.{self.NAME}_op' + return f'deepspeed.ops.{self.NAME}_op' def extra_ldflags(self): if not self.is_rocm_pytorch(): @@ -23,14 +23,10 @@ class TransformerBuilder(CUDAOpBuilder): def sources(self): return [ - 'csrc/transformer/ds_transformer_cuda.cpp', - 'csrc/transformer/cublas_wrappers.cu', - 'csrc/transformer/transform_kernels.cu', - 'csrc/transformer/gelu_kernels.cu', - 'csrc/transformer/dropout_kernels.cu', - 'csrc/transformer/normalize_kernels.cu', - 'csrc/transformer/softmax_kernels.cu', - 'csrc/transformer/general_kernels.cu' + 'csrc/random_ltd/pt_binding.cpp', + 'csrc/random_ltd/gather_scatter.cu', + 'csrc/random_ltd/slice_attn_masks.cu', + 'csrc/random_ltd/token_sort.cu' ] def include_paths(self): diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py index 004fdd698200f4c8d47831297a5a3306352e4bef..25d5f63a01e97df553656d096cf4544b6b66c9f2 100644 --- a/op_builder/sparse_attn.py +++ b/op_builder/sparse_attn.py @@ -1,7 +1,6 @@ """ Copyright 2020 The Microsoft DeepSpeed Team """ -import warnings from .builder import OpBuilder try: @@ -32,11 +31,12 @@ class SparseAttnBuilder(OpBuilder): #command_status = list(map(self.command_exists, required_commands)) #deps_compatible = all(command_status) -#####aiss debug 0506############## if self.is_rocm_pytorch(): - # self.warning(f'{self.NAME} is not compatible with ROCM') - # return False + #aiss debug + #self.warning(f'{self.NAME} is not compatible with ROCM') + #return False return True + try: import torch except ImportError: diff --git a/op_builder/spatial_inference.py b/op_builder/spatial_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..18d19d40e86f1c964012da3183e62ab00603b09a --- /dev/null +++ b/op_builder/spatial_inference.py @@ -0,0 +1,45 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team 
+''' +from .builder import CUDAOpBuilder, installed_cuda_version + + +class SpatialInferenceBuilder(CUDAOpBuilder): + BUILD_VAR = "DS_BUILD_SPATIAL_INFERENCE" + NAME = "spatial_inference" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.spatial.{self.NAME}_op' + + def is_compatible(self, verbose=True): + try: + import torch + except ImportError: + self.warning( + "Please install torch if trying to pre-compile inference kernels") + return False + + cuda_okay = True + if not self.is_rocm_pytorch() and torch.cuda.is_available(): + sys_cuda_major, _ = installed_cuda_version() + torch_cuda_major = int(torch.version.cuda.split('.')[0]) + cuda_capability = torch.cuda.get_device_properties(0).major + if cuda_capability >= 8: + if torch_cuda_major < 11 or sys_cuda_major < 11: + self.warning( + "On Ampere and higher architectures please use CUDA 11+") + cuda_okay = False + return super().is_compatible(verbose) and cuda_okay + + def sources(self): + return [ + 'csrc/spatial/csrc/opt_bias_add.cu', + 'csrc/spatial/csrc/pt_binding.cpp', + ] + + def include_paths(self): + return ['csrc/spatial/includes', 'csrc/includes'] diff --git a/op_builder/transformer_inference.py b/op_builder/transformer_inference.py old mode 100644 new mode 100755 index 23eab4886e80e4026e738d17411e54a9f68448d7..9bb9bbb956b3e181c1b044df6eb422489bb97386 --- a/op_builder/transformer_inference.py +++ b/op_builder/transformer_inference.py @@ -1,4 +1,6 @@ -from .builder import CUDAOpBuilder +'''Copyright The Microsoft DeepSpeed Team''' + +from .builder import CUDAOpBuilder, installed_cuda_version class InferenceBuilder(CUDAOpBuilder): @@ -12,14 +14,53 @@ class InferenceBuilder(CUDAOpBuilder): def absolute_name(self): return f'deepspeed.ops.transformer.inference.{self.NAME}_op' + def is_compatible(self, verbose=True): + try: + import torch + except ImportError: + self.warning( + "Please install torch 
if trying to pre-compile inference kernels") + return False + + cuda_okay = True + if not self.is_rocm_pytorch() and torch.cuda.is_available(): + sys_cuda_major, _ = installed_cuda_version() + torch_cuda_major = int(torch.version.cuda.split('.')[0]) + cuda_capability = torch.cuda.get_device_properties(0).major + if cuda_capability < 6: + self.warning( + "NVIDIA Inference is only supported on Pascal and newer architectures" + ) + cuda_okay = False + if cuda_capability >= 8: + if torch_cuda_major < 11 or sys_cuda_major < 11: + self.warning( + "On Ampere and higher architectures please use CUDA 11+") + cuda_okay = False + return super().is_compatible(verbose) and cuda_okay + + def filter_ccs(self, ccs): + ccs_retained = [] + ccs_pruned = [] + for cc in ccs: + if int(cc[0]) >= 6: + ccs_retained.append(cc) + else: + ccs_pruned.append(cc) + if len(ccs_pruned) > 0: + self.warning(f"Filtered compute capabilities {ccs_pruned}") + return ccs_retained + def sources(self): return [ 'csrc/transformer/inference/csrc/pt_binding.cpp', 'csrc/transformer/inference/csrc/gelu.cu', - 'csrc/transformer/inference/csrc/normalize.cu', + 'csrc/transformer/inference/csrc/relu.cu', + 'csrc/transformer/inference/csrc/layer_norm.cu', 'csrc/transformer/inference/csrc/softmax.cu', 'csrc/transformer/inference/csrc/dequantize.cu', 'csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu', + 'csrc/transformer/inference/csrc/transform.cu', ] def extra_ldflags(self): @@ -29,4 +70,4 @@ class InferenceBuilder(CUDAOpBuilder): return [] def include_paths(self): - return ['csrc/transformer/inference/includes'] + return ['csrc/transformer/inference/includes', 'csrc/includes'] diff --git a/release/bump_patch_version.py b/release/bump_patch_version.py index 8f1150deab50d5ce990daac7ab86caa5f278a1ea..40d9badf09ecf9099b6227e198e4830f949c9ee6 100644 --- a/release/bump_patch_version.py +++ b/release/bump_patch_version.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from packaging import version 
as pkg_version with open('../version.txt') as fd: diff --git a/requirements/requirements-autotuning.txt b/requirements/requirements-autotuning.txt old mode 100644 new mode 100755 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 313379c4ecc250f77123cddfce24af9cfcadcb8c..a5ee03c61b555264716f36269462d00766626fce 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,8 +1,9 @@ -clang-format +clang-format>=14.0.6 docutils<0.18 +future importlib-metadata>=4 megatron-lm==1.1.5 -pre-commit +pre-commit>=2.20.0 pytest pytest-forked pytest-randomly @@ -10,4 +11,7 @@ pytest-xdist recommonmark sphinx sphinx-rtd-theme -torchvision +tensorboard +#torchvision +transformers +wandb diff --git a/requirements/requirements-inf.txt b/requirements/requirements-inf.txt new file mode 100644 index 0000000000000000000000000000000000000000..848a7f7a485de46d9941b3de6919b42724f76409 --- /dev/null +++ b/requirements/requirements-inf.txt @@ -0,0 +1,5 @@ +google +lm-eval==0.3.0 +protobuf +transformers +transformers[sentencepiece] diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt index f3ffe3b615a2433e52b475a82901408b33e9ae2e..fcd0ec5a9a6a4846ce7066f21953bb0af695af29 100644 --- a/requirements/requirements-readthedocs.txt +++ b/requirements/requirements-readthedocs.txt @@ -1,5 +1,9 @@ +autodoc_pydantic docutils<0.18 hjson +packaging psutil +py-cpuinfo +pydantic torch tqdm diff --git a/requirements/requirements-sd.txt b/requirements/requirements-sd.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9026206a7379e89ca084a652e23b2f5e5904fe1 --- /dev/null +++ b/requirements/requirements-sd.txt @@ -0,0 +1,2 @@ +diffusers +triton==2.0.0.dev20221005 diff --git a/requirements/requirements-sparse_attn.txt b/requirements/requirements-sparse_attn.txt old mode 100644 new mode 100755 index 
7688e3e1ba2139501dc773871c962ed4d5ad955b..f929bb0168a5103a81c3102406991d2580578536 --- a/requirements/requirements-sparse_attn.txt +++ b/requirements/requirements-sparse_attn.txt @@ -1 +1 @@ -#triton==1.0.0 +triton==1.0.0 diff --git a/requirements/requirements.txt b/requirements/requirements.txt old mode 100644 new mode 100755 index 895e252a454ffb47e3836652b6127a94feaac84c..080812208bc23eb46faf6e951750404efc45e018 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,8 +1,9 @@ hjson ninja numpy -packaging +packaging>=20.0 psutil py-cpuinfo -torch +pydantic +#torch tqdm diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..193ba16b024776bf2571a62de7e2b4aa8e48ddb1 --- /dev/null +++ b/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash +source `pwd`/env.sh +export DS_BUILD_VERSION=dtk22.10.1 +DS_BUILD_RANDOM_LTD=0 DS_BUILD_QUANTIZER=0 DS_BUILD_TRANSFORMER_INFERENCE=0 DS_BUILD_OPS=1 verbose=1 CXX=hipcc CC=hipcc python3 setup.py install bdist_wheel diff --git a/scripts/check-license.py b/scripts/check-license.py new file mode 100755 index 0000000000000000000000000000000000000000..519827d7df670c06bd6aedd1817d3c49f3101f6a --- /dev/null +++ b/scripts/check-license.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +from __future__ import annotations +'''Copyright The Microsoft DeepSpeed Team''' +""" +Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py +""" + +import subprocess +import sys + + +def err(s: str) -> None: + print(s, file=sys.stderr) + + +success = True +failures = [] +for f in sys.argv[1:]: + res = subprocess.run( + ["git", + "grep", + "--quiet", + "-e", + r"Copyright .* DeepSpeed Team", + f], + capture_output=True) + if res.returncode == 1: + success = False + failures.append(f) + elif res.returncode == 2: + err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") + err(res.stderr.decode("utf-8")) + sys.exit(2) + +if not success: + err(f'{failures}: Missing license at top of 
file') + err(res.stdout.decode("utf-8")) + sys.exit(1) diff --git a/scripts/check-torchdist.py b/scripts/check-torchdist.py new file mode 100755 index 0000000000000000000000000000000000000000..d655b7b9008e2b47ee1b07fe3b4b44a3fc5d5c56 --- /dev/null +++ b/scripts/check-torchdist.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +from __future__ import annotations +'''Copyright The Microsoft DeepSpeed Team''' +""" +Checks each file in sys.argv for the string "torch.distributed". +Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py +""" + +import subprocess +import sys + + +def err(s: str) -> None: + print(s, file=sys.stderr) + + +# There are many ways we could search for the string "torch.distributed", but `git +# grep --no-index` is nice because +# - it's very fast (as compared to iterating over the file in Python) +# - we can reasonably assume it's available on all machines +# - unlike plain grep, which is slower and has different flags on MacOS versus +# Linux, git grep is always the same. +res = subprocess.run( + ["git", + "grep", + "-Hn", + "--no-index", + r"torch\.distributed", + *sys.argv[1:]], + capture_output=True, +) +if res.returncode == 0: + err('Error: The string "torch.distributed" was found. Please replace all calls to torch.distributed with "deepspeed.comm"' + ) + err(res.stdout.decode("utf-8")) + sys.exit(1) +elif res.returncode == 2: + err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") + err(res.stderr.decode("utf-8")) + sys.exit(2) diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 0842ecb0caa6edbfc98556b5d3bde361ebd4ac4a..d7607405d464df0712410370b7e1e1e30a124c91 --- a/setup.py +++ b/setup.py @@ -4,42 +4,39 @@ Copyright 2020 The Microsoft DeepSpeed Team DeepSpeed library To build wheel on Windows: - 1. Install pytorch, such as pytorch 1.8 + cuda 11.1 + 1. Install pytorch, such as pytorch 1.12 + cuda 11.6 2. Install visual cpp build tool - 3. 
Launch cmd console with Administrator privilege for creating required symlink folders + 3. Include cuda toolkit + 4. Launch cmd console with Administrator privilege for creating required symlink folders Create a new wheel via the following command: - python setup.py bdist_wheel + build_win.bat The wheel will be located at: dist/*.whl """ import os import sys -import shutil import subprocess -import warnings -from setuptools import setup, find_packages +from setuptools import setup, find_packages, find_namespace_packages from setuptools.command import egg_info import time torch_available = True try: import torch - from torch.utils.cpp_extension import BuildExtension except ImportError: torch_available = False print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \ 'Please visit https://pytorch.org/ to see how to properly install torch on your system.') -from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder +from op_builder import get_default_compute_capabilities, OpBuilder +from op_builder.all_ops import ALL_OPS +from op_builder.builder import installed_cuda_version # fetch rocm state is_rocm_pytorch = OpBuilder.is_rocm_pytorch() rocm_version = OpBuilder.installed_rocm_version() -#aiss -print("is_rocm_pytorch: ", is_rocm_pytorch) -print("rocm_version: ", rocm_version) RED_START = '\033[31m' RED_END = '\033[0m' @@ -64,7 +61,9 @@ extras_require = { 'dev': fetch_requirements('requirements/requirements-dev.txt'), 'autotuning': fetch_requirements('requirements/requirements-autotuning.txt'), 'autotuning_ml': fetch_requirements('requirements/requirements-autotuning-ml.txt'), - 'sparse_attn': fetch_requirements('requirements/requirements-sparse_attn.txt') + 'sparse_attn': fetch_requirements('requirements/requirements-sparse_attn.txt'), + 'inf': fetch_requirements('requirements/requirements-inf.txt'), + 'sd': fetch_requirements('requirements/requirements-sd.txt') } # Add specific cupy version to both onebit extension variants @@ 
-76,7 +75,7 @@ if torch_available and torch.cuda.is_available(): if rocm_major <= 4: cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}" else: - cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}" + cupy = f"cupy-cuda{''.join(map(str,installed_cuda_version()))}" if cupy: extras_require['1bit'].append(cupy) extras_require['1bit_mpi'].append(cupy) @@ -92,7 +91,9 @@ cmdclass = {} # For any pre-installed ops force disable ninja if torch_available: - cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False) + from accelerator import get_accelerator + cmdclass['build_ext'] = get_accelerator().build_extension().with_options( + use_ninja=False) if torch_available: TORCH_MAJOR = torch.__version__.split('.')[0] @@ -135,7 +136,7 @@ def op_envvar(op_name): f"{op_name} is missing BUILD_VAR field" return ALL_OPS[op_name].BUILD_VAR -sparse_env='DS_BUILD_SPARSE_ATTN' + def op_enabled(op_name): env_var = op_envvar(op_name) return int(os.environ.get(env_var, BUILD_OP_DEFAULT)) @@ -146,21 +147,21 @@ install_ops = dict.fromkeys(ALL_OPS.keys(), False) for op_name, builder in ALL_OPS.items(): op_compatible = builder.is_compatible() compatible_ops[op_name] = op_compatible + #aiss + print("op_name: ", op_name) + print("op_enabled: ", op_enabled(op_name)) + print("op_compatible: ", op_compatible) # If op is requested but not available, throw an error if op_enabled(op_name) and not op_compatible: env_var = op_envvar(op_name) if env_var not in os.environ: builder.warning(f"One can disable {op_name} with {env_var}=0") abort(f"Unable to pre-compile {op_name}") - print(f"op_name: {op_name}") - # If op is compatible update install reqs so it can potentially build/run later - if op_compatible: - reqs = builder.python_requirements() - install_requires += builder.python_requirements() - + # if op is compatible but install is not enabled (JIT mode) if is_rocm_pytorch and op_compatible and not op_enabled(op_name): builder.hipify_extension() + # If op install enabled, add builder to 
extensions if op_enabled(op_name) and op_compatible: assert torch_available, f"Unable to pre-compile {op_name}, please first install torch" @@ -178,12 +179,11 @@ if command_exists('git') and 'DS_BUILD_STRING' not in os.environ: git_hash = result.decode('utf-8').strip() result = subprocess.check_output(git_branch_cmd, shell=True) git_branch = result.decode('utf-8').strip() - ##aiss 20220810 +#add dtk version if os.getenv('DS_BUILD_VERSION'): version_dtk = os.getenv('DS_BUILD_VERSION', "") git_hash += "." + version_dtk - #git_hash +="dtk22.04" except subprocess.CalledProcessError: git_hash = "unknown" git_branch = "unknown" @@ -205,13 +205,14 @@ if sys.platform == "win32": # It needs Administrator privilege to create symlinks on Windows. create_dir_symlink('..\\..\\csrc', '.\\deepspeed\\ops\\csrc') create_dir_symlink('..\\..\\op_builder', '.\\deepspeed\\ops\\op_builder') + create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator') egg_info.manifest_maker.template = 'MANIFEST_win.in' # Parse the DeepSpeed version string from version.txt version_str = open('version.txt', 'r').read().strip() # Build specifiers like .devX can be added at install time. Otherwise, add the git hash. 
-# example: DS_BUILD_STR=".dev20201022" python setup.py sdist bdist_wheel +# example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel # Building wheel for distribution, update version file if 'DS_BUILD_STRING' in os.environ: @@ -226,20 +227,31 @@ elif os.path.isfile('build.txt'): else: # None of the above, probably installing from source version_str += f'+{git_hash}' - print("version_str: ",version_str) torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR]) +bf16_support = False # Set cuda_version to 0.0 if cpu-only cuda_version = "0.0" +nccl_version = "0.0" # Set hip_version to 0.0 if cpu-only hip_version = "0.0" if torch_available and torch.version.cuda is not None: cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + if sys.platform != "win32": + if isinstance(torch.cuda.nccl.version(), int): + # This will break if minor version > 9 + nccl_version = ".".join(str(torch.cuda.nccl.version())[:2]) + else: + nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2])) + if hasattr(torch.cuda, 'is_bf16_supported') and torch.cuda.is_available(): + bf16_support = torch.cuda.is_bf16_supported() if torch_available and hasattr(torch.version, 'hip') and torch.version.hip is not None: hip_version = ".".join(torch.version.hip.split('.')[:2]) torch_info = { "version": torch_version, + "bf16_support": bf16_support, "cuda_version": cuda_version, + "nccl_version": nccl_version, "hip_version": hip_version } @@ -277,10 +289,10 @@ setup(name='deepspeed', }, install_requires=install_requires, extras_require=extras_require, - packages=find_packages(exclude=["docker", - "third_party", - "csrc", - "op_builder"]), + #packages=find_packages(include=['deepspeed', + # 'deepspeed.*']), + packages=find_namespace_packages(include=['deepspeed', + 'deepspeed.*']), include_package_data=True, scripts=[ 'bin/deepspeed', @@ -288,13 +300,16 @@ setup(name='deepspeed', 'bin/ds', 'bin/ds_ssh', 'bin/ds_report', + 'bin/ds_bench', + 'bin/dsr', 'bin/ds_elastic' ], classifiers=[ 
'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9' + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10' ], license='MIT', ext_modules=ext_modules, diff --git a/tests/accelerator/ds_config.json b/tests/accelerator/ds_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8e9ac6b889eaebf80e246fad923fe967466ca14e --- /dev/null +++ b/tests/accelerator/ds_config.json @@ -0,0 +1,19 @@ +{ + "train_batch_size": 1, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "weight_decay": 1e-2 + } + }, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + } +} diff --git a/tests/accelerator/test_ds_init.py b/tests/accelerator/test_ds_init.py new file mode 100644 index 0000000000000000000000000000000000000000..d535e9918052a9168faa811b56d69b3c8e447e90 --- /dev/null +++ b/tests/accelerator/test_ds_init.py @@ -0,0 +1,47 @@ +'''Copyright The Microsoft DeepSpeed Team''' +import os +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator + + +class OneLayerNet(torch.nn.Module): + def __init__(self, D_in, D_out): + """ + In the constructor we instantiate two nn.Linear modules and assign them as + member variables. + """ + super(OneLayerNet, self).__init__() + self.linear1 = torch.nn.Linear(D_in, D_out) + + def forward(self, x): + """ + In the forward function we accept a Variable of input data and we must return + a Variable of output data. We can use Modules defined in the constructor as + well as arbitrary operators on Variables. 
+ """ + h_relu = self.linear1(x).clamp(min=0) + y_pred = self.linear1(h_relu) + return y_pred + + +def test_literal_device(): + model = OneLayerNet(128, 128) + + os.environ['RANK'] = '0' + os.environ['WORLD_SIZE'] = '1' + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '8088' + os.environ['LOCAL_RANK'] = '0' + deepspeed.init_distributed(get_accelerator().communication_backend_name()) + deepspeed.initialize(model=model, config='ds_config.json') + string = get_accelerator().device_name() #'xpu' or 'cuda' + string0 = get_accelerator().device_name(0) #'xpu:0' or 'cuda:0' + string1 = get_accelerator().device_name(1) #'xpu:1' or 'cuda:1' + #aiss + print(string0) + print(string1) + + assert string == 'xpu' or string == 'cuda' + assert string0 == 'xpu:0' or string0 == 'cuda:0' + assert string1 == 'xpu:1' or string1 == 'cuda:1' diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py old mode 100644 new mode 100755 index a337a1b525f35045f50132e07cabee7a3303c30c..1082554f81d16a8d389a866af49586d4ed140d5e --- a/tests/benchmarks/flatten_bench.py +++ b/tests/benchmarks/flatten_bench.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + #!/usr/bin/env python # run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l) # @@ -11,7 +13,8 @@ import argparse import gc import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch._utils import _flatten_dense_tensors +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import UtilsBuilder from apex_C import flatten as flatten_apex @@ -24,11 +27,11 @@ torch.manual_seed(0) # emulate a small typical model weights x = [ torch.rand((512, - 512)).cuda(), + 512)).to(get_accelerator().device_name()), torch.rand((512, - 1024)).cuda(), + 1024)).to(get_accelerator().device_name()), torch.rand((512, - 30000)).cuda() + 30000)).to(get_accelerator().device_name()) ] t = x * 30 @@ -69,15 +72,15 @@ def cprofileme(): 
print("py") cProfile.run("py()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") cProfile.run("cpp()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") cProfile.run("apex()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### timeit #### @@ -89,13 +92,13 @@ def timeme(): print("--------------- timeit -----------------") print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### line_profiler #### @@ -107,17 +110,17 @@ def timeme(): def line_profileme(): print("--------------- line_profiler -----------------") print("py") - profile(py)() + profile(py)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") - profile(cpp)() + profile(cpp)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") - profile(apex)() + profile(apex)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() if __name__ == "__main__": diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py old mode 100644 new mode 100755 index 85baf751ad9c886252aac0f1fafd07ff5ebb1044..a4a1b63b3dd0a4e5347d93cac365854e0d711013 --- a/tests/benchmarks/unflatten_bench.py +++ b/tests/benchmarks/unflatten_bench.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + #!/usr/bin/env python # run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l) @@ -11,6 +13,7 @@ import argparse import gc import torch from torch._utils 
import _flatten_dense_tensors, _unflatten_dense_tensors +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import UtilsBuilder from apex_C import flatten as flatten_apex @@ -24,11 +27,11 @@ torch.manual_seed(0) # emulate a small typical model weights x = [ torch.rand((512, - 512)).cuda(), + 512)).to(get_accelerator().device_name()), torch.rand((512, - 1024)).cuda(), + 1024)).to(get_accelerator().device_name()), torch.rand((512, - 30000)).cuda() + 30000)).to(get_accelerator().device_name()) ] unflat_t = x * 30 @@ -78,15 +81,15 @@ def cprofileme(): print("py") cProfile.run("py()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") cProfile.run("cpp()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") cProfile.run("apex()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### timeit #### @@ -98,13 +101,13 @@ def timeme(): print("--------------- timeit -----------------") print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### line_profiler #### @@ -116,17 +119,17 @@ def timeme(): def line_profileme(): print("--------------- line_profier -----------------") print("py") - profile(py)() + profile(py)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") - profile(cpp)() + profile(cpp)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") - profile(apex)() + profile(apex)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + 
get_accelerator().empty_cache() if __name__ == "__main__": diff --git a/tests/conftest.py b/tests/conftest.py index a0e4705f4984f0b0f595843d35a2fa42b7c72901..86662993a4fb4f4ba7cb50f26c9bc78cd0fbc955 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,71 @@ +'''Copyright The Microsoft DeepSpeed Team''' + # tests directory-specific settings - this file is run automatically by pytest before any tests are run import sys +import pytest +import os from os.path import abspath, dirname, join +import torch +import warnings + +# Set this environment variable for the T5 inference unittest(s) (e.g. google/t5-v1_1-small) +os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python' # allow having multiple repository checkouts and not needing to remember to rerun # 'pip install -e .[dev]' when switching between checkouts and running tests. git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) sys.path.insert(1, git_repo_path) + + +def pytest_addoption(parser): + parser.addoption("--torch_ver", default=None, type=str) + parser.addoption("--cuda_ver", default=None, type=str) + + +def validate_version(expected, found): + version_depth = expected.count('.') + 1 + found = '.'.join(found.split('.')[:version_depth]) + return found == expected + + +@pytest.fixture(scope="session", autouse=True) +def check_environment(pytestconfig): + expected_torch_version = pytestconfig.getoption("torch_ver") + expected_cuda_version = pytestconfig.getoption("cuda_ver") + if expected_torch_version is None: + warnings.warn( + "Running test without verifying torch version, please provide an expected torch version with --torch_ver" + ) + elif not validate_version(expected_torch_version, torch.__version__): + pytest.exit( + f"expected torch version {expected_torch_version} did not match found torch version {torch.__version__}", + returncode=2) + if expected_cuda_version is None: + warnings.warn( + "Running test without verifying cuda version, please provide an expected cuda 
version with --cuda_ver" + ) + elif not validate_version(expected_cuda_version, torch.version.cuda): + pytest.exit( + f"expected cuda version {expected_cuda_version} did not match found cuda version {torch.version.cuda}", + returncode=2) + + +# Override of pytest "runtest" for DistributedTest class +# This hook is run before the default pytest_runtest_call +@pytest.hookimpl(tryfirst=True) +def pytest_runtest_call(item): + # We want to use our own launching function for distributed tests + if getattr(item.cls, "is_dist_test", False): + dist_test_class = item.cls() + dist_test_class(item._request) + item.runtest = lambda: True # Dummy function so test is not run twice + + +@pytest.hookimpl(tryfirst=True) +def pytest_fixture_setup(fixturedef, request): + if getattr(fixturedef.func, "is_dist_fixture", False): + #for val in dir(request): + # print(val.upper(), getattr(request, val), "\n") + dist_fixture_class = fixturedef.func() + dist_fixture_class(request) diff --git a/tests/lightning/test_simple.py b/tests/lightning/test_simple.py index 6fb36372a17a5a32a4efdcaa906188649854de4e..c78768bc707a3ee6723e4f17fd8ce98dbe91a7ba 100644 --- a/tests/lightning/test_simple.py +++ b/tests/lightning/test_simple.py @@ -1,6 +1,8 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.plugins import DeepSpeedPlugin +from pytorch_lightning.strategies import DeepSpeedStrategy from torch.utils.data import DataLoader, Dataset @@ -51,5 +53,5 @@ def test_lightning_model(): """Test that DeepSpeed works with a simple LightningModule and LightningDataModule.""" model = BoringModel() - trainer = Trainer(strategy=DeepSpeedPlugin(), max_epochs=1, precision=16, gpus=1) + trainer = Trainer(strategy=DeepSpeedStrategy(), max_epochs=1, precision=16, gpus=1) trainer.fit(model) diff --git a/tests/model/BingBertSquad/BingBertSquad_run_func_test.py b/tests/model/BingBertSquad/BingBertSquad_run_func_test.py old mode 100644 
new mode 100755 index 90e6858e8bcbaac1b722e7b37210bc8a7e997ab8..828771cd324b1902f80fd4f9b5cb368760920b30 --- a/tests/model/BingBertSquad/BingBertSquad_run_func_test.py +++ b/tests/model/BingBertSquad/BingBertSquad_run_func_test.py @@ -3,9 +3,7 @@ # Note: please copy webtext data to "Megatron-LM" folder, before running this script. import unittest -import subprocess import os -import time import re from .BingBertSquad_test_common import BaseTestCase @@ -16,7 +14,7 @@ def grep_loss_from_file(file_name): with open(file_name, 'r') as f: lines = f.readlines() line_filter = "bert_squad_progress: step=" - match_number = re.compile('loss=([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') + match_number = re.compile(r'loss=([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') for line in lines: if line_filter in line: diff --git a/tests/model/BingBertSquad/BingBertSquad_test_common.py b/tests/model/BingBertSquad/BingBertSquad_test_common.py old mode 100644 new mode 100755 index a9678bb6923fb986a9cb202fca69b98e1fe1aae8..b6069d76e69a6518e5abe99afb38f4e8cc10426e --- a/tests/model/BingBertSquad/BingBertSquad_test_common.py +++ b/tests/model/BingBertSquad/BingBertSquad_test_common.py @@ -5,7 +5,6 @@ import unittest import subprocess import os import time -import re class BaseTestCase(unittest.TestCase): diff --git a/tests/model/BingBertSquad/__init__.py b/tests/model/BingBertSquad/__init__.py old mode 100644 new mode 100755 diff --git a/tests/model/BingBertSquad/deepspeed_bsz24_fp16_config.json b/tests/model/BingBertSquad/deepspeed_bsz24_fp16_config.json old mode 100644 new mode 100755 diff --git a/tests/model/BingBertSquad/deepspeed_bsz24_fp16_eigenvalue_quantize_config.json b/tests/model/BingBertSquad/deepspeed_bsz24_fp16_eigenvalue_quantize_config.json old mode 100644 new mode 100755 diff --git a/tests/model/BingBertSquad/deepspeed_bsz24_fp16_zero2_config.json b/tests/model/BingBertSquad/deepspeed_bsz24_fp16_zero2_config.json old mode 100644 new mode 100755 diff --git 
a/tests/model/BingBertSquad/deepspeed_bsz24_fp32_config.json b/tests/model/BingBertSquad/deepspeed_bsz24_fp32_config.json old mode 100644 new mode 100755 diff --git a/tests/model/BingBertSquad/run_BingBertSquad.sh b/tests/model/BingBertSquad/run_BingBertSquad.sh old mode 100644 new mode 100755 diff --git a/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh b/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh old mode 100644 new mode 100755 diff --git a/tests/model/BingBertSquad/run_tests.sh b/tests/model/BingBertSquad/run_tests.sh old mode 100644 new mode 100755 diff --git a/tests/model/BingBertSquad/test_e2e_squad.py b/tests/model/BingBertSquad/test_e2e_squad.py index 0140ebd877703c2446c6dd9839951cc8040141f6..7dfd718bc6bd8cae0fb9428d05f18fb6d0c92204 100644 --- a/tests/model/BingBertSquad/test_e2e_squad.py +++ b/tests/model/BingBertSquad/test_e2e_squad.py @@ -1,11 +1,11 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import subprocess as sp -import datetime import os from math import isclose import sys import pytest import json -import argparse sys.path.append("../../../DeepSpeedExamples/BingBertSquad") import evaluate as eval diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json old mode 100644 new mode 100755 diff --git 
a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_func_scheduler.json b/tests/model/Megatron_GPT2/ds_config_func_scheduler.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs16.json b/tests/model/Megatron_GPT2/ds_config_perf_bs16.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs32.json b/tests/model/Megatron_GPT2/ds_config_perf_bs32.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs8.json b/tests/model/Megatron_GPT2/ds_config_perf_bs8.json old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/ds_gpt2_test.sh b/tests/model/Megatron_GPT2/ds_gpt2_test.sh old mode 100644 new mode 100755 diff --git a/tests/model/Megatron_GPT2/run_checkpoint_test.py b/tests/model/Megatron_GPT2/run_checkpoint_test.py old mode 100644 new mode 100755 index fe564d4fdb8afe4f6386706f3793f24f6fb8ff3d..628547ef2f14f8120ddd84fdd90bed765257fed7 --- a/tests/model/Megatron_GPT2/run_checkpoint_test.py +++ b/tests/model/Megatron_GPT2/run_checkpoint_test.py @@ -5,7 +5,6 @@ import unittest import subprocess import os -import time import re from .test_common import BaseTestCase @@ -26,7 +25,7 @@ def grep_loss_from_file(file_name): with open(file_name, 'r') as f: lines = f.readlines() line_filter = "validation loss at the 
end of training for test data | LM loss:" - match_number = re.compile('LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') + match_number = re.compile(r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') for line in lines: if line_filter in line: diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py old mode 100644 new mode 100755 index 463aa1f94f151191b962fdb50c7b2b9512c40726..78a685e0f0e275088c0d86afe28063b5c0454c6f --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -3,9 +3,7 @@ # Note: please copy webtext data to "Megatron-LM" folder, before running this script. import unittest -import subprocess import os -import time import re from .test_common import BaseTestCase @@ -22,7 +20,7 @@ def grep_loss_from_file(file_name): with open(file_name, 'r') as f: lines = f.readlines() line_filter = "validation loss at the end of training for test data | LM loss:" - match_number = re.compile('LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') + match_number = re.compile(r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') for line in lines: if line_filter in line: diff --git a/tests/model/Megatron_GPT2/run_perf_baseline.py b/tests/model/Megatron_GPT2/run_perf_baseline.py old mode 100644 new mode 100755 index f30e9cfe9bc13620025e29b0b4e04e872f12ec0e..0c7233d5dc8f47d034386023205bd75970b718c7 --- a/tests/model/Megatron_GPT2/run_perf_baseline.py +++ b/tests/model/Megatron_GPT2/run_perf_baseline.py @@ -3,9 +3,6 @@ # Note: please copy webtext data to "Megatron-LM" folder, before running this script. 
import unittest -import subprocess -import os -import time import re from test_common import BaseTestCase @@ -103,7 +100,7 @@ class GPT2PerfBaselineTestCase(BaseTestCase): lines = f.readlines() line_filter = "elapsed time per iteration" match_number = re.compile( - 'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)' + r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)' ) for line in lines: diff --git a/tests/model/Megatron_GPT2/run_perf_test.py b/tests/model/Megatron_GPT2/run_perf_test.py old mode 100644 new mode 100755 index 64b20f4866a49e48e11816d063fab97b2ca98f30..f24b441291f9bc2afb16f32d892b7dd4269f1a94 --- a/tests/model/Megatron_GPT2/run_perf_test.py +++ b/tests/model/Megatron_GPT2/run_perf_test.py @@ -3,11 +3,9 @@ # Note: please copy webtext data to "Megatron-LM" folder, before running this script. import unittest -import subprocess -import os -import time import re -from test_common import BaseTestCase +#from test_common import BaseTestCase +from .test_common import BaseTestCase class GPT2PerfTestCase(BaseTestCase): @@ -107,7 +105,7 @@ class GPT2PerfTestCase(BaseTestCase): lines = f.readlines() line_filter = "elapsed time per iteration" match_number = re.compile( - 'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)' + r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)' ) for line in lines: diff --git a/tests/model/Megatron_GPT2/test_common.py b/tests/model/Megatron_GPT2/test_common.py old mode 100644 new mode 100755 index 04b3e4a23a6c065c30d8618b17a98411c732c308..6f9bec89eeb5b08a637ffaf751499db5534f4ff0 --- a/tests/model/Megatron_GPT2/test_common.py +++ b/tests/model/Megatron_GPT2/test_common.py @@ -5,7 +5,6 @@ import unittest import subprocess import os import time -import re class BaseTestCase(unittest.TestCase): diff --git a/tests/model/run_sanity_check.py b/tests/model/run_sanity_check.py old mode 100644 new mode 100755 index 
2f020b52db163aa09d2c2b42831c8815a98df28f..a226ccb8ca06c10c1a1f2204981a322213ae1d17 --- a/tests/model/run_sanity_check.py +++ b/tests/model/run_sanity_check.py @@ -8,8 +8,6 @@ import unittest sys.path.append('../DeepSpeedExamples/Megatron_GPT2') sys.path.append('../DeepSpeedExamples/BingBertSquad') -import os - # Import the test cases here. import Megatron_GPT2 import BingBertSquad diff --git a/tests/onebit/test_mpi_backend.py b/tests/onebit/test_mpi_backend.py index 785021cf0935a69344b9593cd3705fa3f2db226c..bb8915f2c00164d24bb3164fb6eb7e828872fe90 100644 --- a/tests/onebit/test_mpi_backend.py +++ b/tests/onebit/test_mpi_backend.py @@ -1,25 +1,28 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from mpi4py import MPI -import time import torch -import torch.distributed as dist +import deepspeed.comm as dist import numpy as np import deepspeed from deepspeed.runtime.comm.mpi import MpiBackend +from deepspeed.accelerator import get_accelerator comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) # Change cuda_aware to True to test out CUDA-Aware MPI communication backend = MpiBackend(cuda_aware=False) -device = torch.device('cuda', rank % torch.cuda.device_count()) +local_rank = rank % get_accelerator().device_count() +device = torch.device(get_accelerator().device_name(), local_rank) -# A simulated compression function using torch.distributed +# A simulated compression function using deepspeed.comm def torch_sim(a): a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) scale = a.norm() / np.sqrt(a.numel()) @@ -36,8 +39,8 @@ def torch_sim(a): [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() - torch.distributed.barrier() + get_accelerator().synchronize() + 
dist.barrier() return a_server_compressed, worker_error, server_error @@ -57,8 +60,7 @@ worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) a_torch, worker_error_torch, server_error_torch = torch_sim(a) -torch.cuda.empty_cache() -local_rank = rank % torch.cuda.device_count() +get_accelerator().empty_cache() a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/onebit/test_mpi_perf.py b/tests/onebit/test_mpi_perf.py index 6017ec873c21f8e076a257800e703cc4364f4119..dd67fdb615e8744e2777ace97b88c6ef322e4f05 100644 --- a/tests/onebit/test_mpi_perf.py +++ b/tests/onebit/test_mpi_perf.py @@ -1,14 +1,14 @@ +'''Copyright The Microsoft DeepSpeed Team''' + from mpi4py import MPI -import time import torch -import torch.distributed as dist -import numpy as np import deepspeed from deepspeed.runtime.comm.mpi import MpiBackend # Configure wall clock timer from deepspeed.utils.timer import SynchronizedWallClockTimer +from deepspeed.accelerator import get_accelerator from statistics import mean @@ -18,11 +18,12 @@ comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) # Change cuda_aware to True to test out CUDA-Aware MPI communication backend = MpiBackend(cuda_aware=False) -device = torch.device('cuda', rank % torch.cuda.device_count()) +local_rank = rank % get_accelerator().device_count() +device = torch.device(get_accelerator().device_name(), local_rank) tensor_size = 300 * 2**20 server_size = int(tensor_size / size) @@ -42,8 +43,6 @@ server_error = torch.zeros(right_server_size, device=device) warmup = 10 iters = 10 -local_rank = rank % torch.cuda.device_count() - # Warmup for i in range(warmup): backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/onebit/test_nccl_backend.py 
b/tests/onebit/test_nccl_backend.py index 16de37174c100985e4b1d0907cda7331421cc23e..e544865b7685782616e08c3d1ec031c8876f8e67 100644 --- a/tests/onebit/test_nccl_backend.py +++ b/tests/onebit/test_nccl_backend.py @@ -1,22 +1,24 @@ -import time +'''Copyright The Microsoft DeepSpeed Team''' + import torch -import torch.distributed as dist +import deepspeed.comm as dist import numpy as np import argparse import deepspeed import os from deepspeed.runtime.comm.nccl import NcclBackend +from deepspeed.accelerator import get_accelerator parser = argparse.ArgumentParser() parser.add_argument('--local_rank', type=int, default=-1) args = parser.parse_args() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) args.local_rank = int(os.environ['LOCAL_RANK']) -torch.cuda.set_device(args.local_rank) -device = torch.device("cuda", args.local_rank) +get_accelerator().set_device(args.local_rank) +device = torch.device(get_accelerator().device_name(), args.local_rank) size = dist.get_world_size() rank = dist.get_rank() @@ -25,7 +27,7 @@ backend = NcclBackend() local_rank = args.local_rank -# A simulated compression function using torch.distributed +# A simulated compression function using deepspeed.comm def torch_sim(a): a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) scale = a.norm() / np.sqrt(a.numel()) @@ -42,8 +44,8 @@ def torch_sim(a): [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() - torch.distributed.barrier() + get_accelerator().synchronize() + dist.barrier() return a_server_compressed, worker_error, server_error @@ -63,7 +65,7 @@ worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) a_torch, worker_error_torch, server_error_torch = torch_sim(a) -torch.cuda.empty_cache() 
+get_accelerator().empty_cache() a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/onebit/test_nccl_perf.py b/tests/onebit/test_nccl_perf.py index 1374cda4ddce3fa7c865f2ded1f6b3e1c72949e0..aab93efac85179be569b8d72c28e6a79ba77efaf 100644 --- a/tests/onebit/test_nccl_perf.py +++ b/tests/onebit/test_nccl_perf.py @@ -1,6 +1,7 @@ -import time +'''Copyright The Microsoft DeepSpeed Team''' + import torch -import torch.distributed as dist +import deepspeed.comm as dist import numpy as np import argparse import deepspeed @@ -8,6 +9,7 @@ import os from deepspeed.runtime.comm.nccl import NcclBackend from deepspeed.utils.timer import SynchronizedWallClockTimer +from deepspeed.accelerator import get_accelerator from statistics import mean timers = SynchronizedWallClockTimer() @@ -16,11 +18,11 @@ parser = argparse.ArgumentParser() parser.add_argument('--local_rank', type=int, default=-1) args = parser.parse_args() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) args.local_rank = int(os.environ['LOCAL_RANK']) -torch.cuda.set_device(args.local_rank) -device = torch.device("cuda", args.local_rank) +get_accelerator().set_device(args.local_rank) +device = torch.device(get_accelerator().device_name(), args.local_rank) size = dist.get_world_size() rank = dist.get_rank() @@ -62,7 +64,7 @@ print("Shape of the compressed buffer:", a_compressed.shape) if rank == 0 else N for i in range(iters): timers('compressed_allreduce').start() backend.compressed_allreduce(a, worker_error, server_error, local_rank) - #torch.distributed.all_reduce(a_compressed) + #deepspeed.comm.all_reduce(a_compressed) timers('compressed_allreduce').stop() time_list.append(timers('compressed_allreduce').elapsed()) diff --git a/tests/perf/adagrad_test.py b/tests/perf/adagrad_test.py new file mode 100755 index 
0000000000000000000000000000000000000000..37ca85ed47d8b62771f01fd52ceef1687d6a4819 --- /dev/null +++ b/tests/perf/adagrad_test.py @@ -0,0 +1,35 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad +import time + +NUM_ITERS = 100 + + +def _test_perf(param, optimizer_func): + optimizer = optimizer_func(param) + avg = 0 + for i in range(NUM_ITERS): + for i, p in enumerate(param): + p.grad = torch.ones_like(p) * 2 + start = time.time() + optimizer.step() + stop = time.time() + avg += (stop - start) + + return avg / NUM_ITERS + + +def _main(): + device = 'cpu' + model_size = 1 * 1024**3 + group_size = [model_size, 274432] + param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size] + torch_time = _test_perf(param, torch.optim.Adagrad) + ds_time = _test_perf(param, DeepSpeedCPUAdagrad) + #print(f"Step time: {torch_time=} {ds_time=}") + print("Step time: {torch_time=%s} {ds_time=%s}" %(torch_time, ds_time)) + + +_main() diff --git a/tests/perf/adam_test.py b/tests/perf/adam_test.py old mode 100644 new mode 100755 index 1ddcd44bbdd49843f41d0a4c353daf88f8f34d07..0c83bfa62984ddf398c88f3a49f34703155f9fa5 --- a/tests/perf/adam_test.py +++ b/tests/perf/adam_test.py @@ -1,24 +1,35 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch from deepspeed.ops.adam import DeepSpeedCPUAdam import time -device = 'cpu' -model_size = 1 * 1024**3 -group_size = [model_size, 274432] - -param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size] -optimizer = DeepSpeedCPUAdam(param) -#torch.set_num_threads(128) -for i, p in enumerate(param): - p.grad = torch.ones(group_size[i], device=device) -#param.grad = torch.ones(model_size, device=device) -avg = 0 -for i in range(100): - start = time.time() - optimizer.step() - stop = time.time() - avg += (stop - start) - for i, p in enumerate(param): - p.grad = torch.ones(group_size[i], device=device) * 2 - #param.grad = 
torch.ones(model_size, device=device) * 2 -print("Elapsed Time is ", avg / 100) +NUM_ITERS = 100 + + +def _test_perf(param, optimizer_func): + optimizer = optimizer_func(param) + avg = 0 + for i in range(NUM_ITERS): + for i, p in enumerate(param): + p.grad = torch.ones_like(p) * 2 + start = time.time() + optimizer.step() + stop = time.time() + avg += (stop - start) + + return avg / NUM_ITERS + + +def _main(): + device = 'cpu' + model_size = 1 * 1024**3 + group_size = [model_size, 274432] + param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size] + torch_time = _test_perf(param, torch.optim.Adam) + ds_time = _test_perf(param, DeepSpeedCPUAdam) + #print(f"Step time: {torch_time=} {ds_time=}") + print("Step time: {torch_time=%s} {ds_time=%s}" %(torch_time, ds_time)) + + +_main() diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py old mode 100644 new mode 100755 index 88f1a1c5961df8d385336f5b223f891086687314..13d486d4d855826b22dc05221a8b71be1a977660 --- a/tests/perf/adam_test1.py +++ b/tests/perf/adam_test1.py @@ -1,13 +1,17 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch from deepspeed.ops.adam import DeepSpeedCPUAdam import time +from deepspeed.accelerator import get_accelerator device = 'cpu' model_size = 1 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) -param_fp16 = torch.nn.Parameter(torch.ones(model_size, - dtype=torch.half, - device='cuda:0')) +param_fp16 = torch.nn.Parameter( + torch.ones(model_size, + dtype=torch.half, + device=get_accelerator().device_name(0))) optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..08b666867b79f11a43abed1b802ae1cf125fb7e0 --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,8 @@ +[pytest] +addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops" +markers = + 
sequential:Tests that need to be run sequentially + inference:Inference model tests + inference_ops:Individual inference operator tests + seq_inference:Inference model tests to run sequentially + nightly:Tests that should be run nightly diff --git a/tests/small_model_debugging/stage3_test.py b/tests/small_model_debugging/stage3_test.py index 5eb1e7d6c14c10c3f24c6a990c98f4ee947bd6f1..ca85c00be486bcd24057a725a97dac72d8d59b46 100644 --- a/tests/small_model_debugging/stage3_test.py +++ b/tests/small_model_debugging/stage3_test.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch import deepspeed diff --git a/tests/small_model_debugging/test.py b/tests/small_model_debugging/test.py index 25418f3c0f9362ccab6fc1e5e2b8f3d386fae3e8..a97792df56ac1d51273b57814b988eaa6e8b4df1 100644 --- a/tests/small_model_debugging/test.py +++ b/tests/small_model_debugging/test.py @@ -1,8 +1,9 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch from deepspeed.pt.deepspeed_linear import LinearModuleForZeroStage3 -from deepspeed.pt.deepspeed_utils import see_memory_usage from deepspeed.pt.log_utils import logger -import deepspeed +from deepspeed.accelerator import get_accelerator def see_memory_usage(message): @@ -11,37 +12,42 @@ def see_memory_usage(message): logger.info(message) logger.info( "Memory Allocated %s GigaBytes ", - torch.cuda.memory_allocated() / (1024 * 1024 * 1024), + get_accelerator().memory_allocated() / (1024 * 1024 * 1024), ) logger.info( "Max Memory Allocated %s GigaBytes", - torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), + get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024), ) logger.info( "Cache Allocated %s GigaBytes", - torch.cuda.memory_cached() / (1024 * 1024 * 1024), + get_accelerator().memory_cached() / (1024 * 1024 * 1024), ) logger.info( "Max cache Allocated %s GigaBytes", - torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), + get_accelerator().max_memory_cached() / (1024 * 1024 * 1024), ) -tens = 
torch.rand(1024, 16384, dtype=torch.half, device=torch.device('cuda')) +tens = torch.rand(1024, + 16384, + dtype=torch.half, + device=torch.device(get_accelerator().device_name())) tens_back = tens.detach().clone() #linear_bk = torch.nn.functional.linear #torch.nn.functional.linear = deepspeed.pt.deepspeed_linear.LinearFunctionForZeroStage3.apply model = LinearModuleForZeroStage3(16384, 16384) -model.cuda().half() +model.to(get_accelerator().device_name()).half() see_memory_usage("Before forward") y = model(tens) see_memory_usage("After forward") -model.weight.data = torch.zeros(1, dtype=torch.half, device=torch.device('cuda')) +model.weight.data = torch.zeros(1, + dtype=torch.half, + device=torch.device(get_accelerator().device_name())) see_memory_usage("After weight zero") diff --git a/tests/small_model_debugging/test_model.py b/tests/small_model_debugging/test_model.py old mode 100644 new mode 100755 index c957bf8f1ecbfb30e441d37435a7182b5c2f51f6..792d683ce47b7e3e03e8a3cd0941afb2564c6a85 --- a/tests/small_model_debugging/test_model.py +++ b/tests/small_model_debugging/test_model.py @@ -1,9 +1,12 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import os import json import argparse import torch import deepspeed from torch.utils.data.distributed import DistributedSampler +import deepspeed.comm as dist class SimpleModel(torch.nn.Module): @@ -56,7 +59,7 @@ def get_args(tmpdir, config_dict): def print0(msg): - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: print(msg, flush=True) @@ -95,7 +98,7 @@ model, _, _,_ = deepspeed.initialize(args=args, def print_params(tag, model): - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: for n, p in model.named_parameters(): print0("{} {}:{}".format(tag, n, p)) @@ -107,7 +110,7 @@ data_loader = get_data_loader(model=model, #print_params('pre-train', model) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: 
print("LOSS:", loss.item()) model.backward(loss) model.step() diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fcb45ab2b68516814a4bfbffebf2e01cbfefd527 100644 --- a/tests/unit/__init__.py +++ b/tests/unit/__init__.py @@ -0,0 +1 @@ +'''Copyright The Microsoft DeepSpeed Team''' diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..bdbaf02922e210cb47c46f17ef9da046ecf27ff1 --- /dev/null +++ b/tests/unit/alexnet_model.py @@ -0,0 +1,164 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed +import deepspeed.comm as dist +import deepspeed.runtime.utils as ds_utils +from deepspeed.accelerator import get_accelerator +from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec + + +class AlexNet(nn.Module): + def __init__(self, num_classes=10): + super(AlexNet, self).__init__() + self.features = nn.Sequential( + nn.Conv2d(3, + 64, + kernel_size=11, + stride=4, + padding=5), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=2, + stride=2), + nn.Conv2d(64, + 192, + kernel_size=5, + padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=2, + stride=2), + nn.Conv2d(192, + 384, + kernel_size=3, + padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, + 256, + kernel_size=3, + padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, + 256, + kernel_size=3, + padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=2, + stride=2), + ) + self.classifier = nn.Linear(256, num_classes) + self.loss_fn = nn.CrossEntropyLoss() + + def forward(self, x, y): + x = self.features(x) + x = x.view(x.size(0), -1) + x = self.classifier(x) + return self.loss_fn(x, y) + + +class AlexNetPipe(AlexNet): + def to_layers(self): + layers = [*self.features, lambda x: x.view(x.size(0), -1), self.classifier] + return layers + + +class 
AlexNetPipeSpec(PipelineModule): + def __init__(self, num_classes=10, **kwargs): + self.num_classes = num_classes + specs = [ + LayerSpec(nn.Conv2d, 3, 64, kernel_size=11, stride=4, padding=5), + LayerSpec(nn.ReLU, inplace=True), + LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), + LayerSpec(nn.Conv2d, 64, 192, kernel_size=5, padding=2), + F.relu, + LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), + LayerSpec(nn.Conv2d, 192, 384, kernel_size=3, padding=1), + F.relu, + LayerSpec(nn.Conv2d, 384, 256, kernel_size=3, padding=1), + F.relu, + LayerSpec(nn.Conv2d, 256, 256, kernel_size=3, padding=1), + F.relu, + LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), + + lambda x: x.view(x.size(0), -1), + LayerSpec(nn.Linear, 256, self.num_classes), # classifier + ] + super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs) + + +# Define this here because we cannot pickle local lambda functions +def cast_to_half(x): + return x.half() + + +def cifar_trainset(fp16=False): + torchvision = pytest.importorskip("torchvision", minversion="0.5.0") + import torchvision.transforms as transforms + + transform_list = [ + transforms.ToTensor(), + transforms.Normalize((0.5, + 0.5, + 0.5), + (0.5, + 0.5, + 0.5)), + ] + if fp16: + transform_list.append(torchvision.transforms.Lambda(cast_to_half)) + + transform = transforms.Compose(transform_list) + + local_rank = get_accelerator().current_device() + + # Only one rank per machine downloads. 
+ dist.barrier() + if local_rank != 0: + dist.barrier() + trainset = torchvision.datasets.CIFAR10(root='/blob/cifar10-data', + train=True, + download=True, + transform=transform) + if local_rank == 0: + dist.barrier() + return trainset + + +def train_cifar(model, + config, + num_steps=400, + average_dp_losses=True, + fp16=True, + seed=123): + with get_accelerator().random().fork_rng( + devices=[get_accelerator().current_device_name()]): + ds_utils.set_random_seed(seed) + + # disable dropout + model.eval() + + trainset = cifar_trainset(fp16=fp16) + config['local_rank'] = dist.get_rank() + + engine, _, _, _ = deepspeed.initialize( + config=config, + model=model, + model_parameters=[p for p in model.parameters()], + training_data=trainset) + + losses = [] + for step in range(num_steps): + loss = engine.train_batch() + losses.append(loss.item()) + if step % 50 == 0 and dist.get_rank() == 0: + print(f'STEP={step} LOSS={loss.item()}') + + if average_dp_losses: + loss_tensor = torch.tensor(losses).to(get_accelerator().device_name()) + dist.all_reduce(loss_tensor) + loss_tensor /= dist.get_world_size() + losses = loss_tensor.tolist() + + return losses diff --git a/tests/unit/autotuning/test_autotuning.py b/tests/unit/autotuning/test_autotuning.py new file mode 100644 index 0000000000000000000000000000000000000000..90b9c5b3a2c8dca9a7fed7df884cd4bcce389c37 --- /dev/null +++ b/tests/unit/autotuning/test_autotuning.py @@ -0,0 +1,86 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import pytest +from unit.simple_model import create_config_from_dict +from deepspeed.launcher import runner as dsrun +from deepspeed.autotuning.autotuner import Autotuner +from deepspeed.autotuning.scheduler import ResourceManager + +RUN_OPTION = 'run' +TUNE_OPTION = 'tune' + + +def test_command_line(): + '''Validate handling of command line arguments''' + for opt in [RUN_OPTION, TUNE_OPTION]: + dsrun.parse_args( + args=f"--num_nodes 1 --num_gpus 1 --autotuning {opt} foo.py".split()) + + 
for error_opts in [ + "--autotuning --num_nodes 1 --num_gpus 1 foo.py".split(), + "--autotuning test --num_nodes 1 -- num_gpus 1 foo.py".split(), + "--autotuning".split() + ]: + with pytest.raises(SystemExit): + dsrun.parse_args(args=error_opts) + + +@pytest.mark.parametrize("arg_mappings", + [ + None, + { + }, + { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size" + }, + { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps": "--gradient_accumulation_steps" + }, + { + "train_batch_size": "-tbs" + } + ]) # yapf: disable +def test_resource_manager_arg_mappings(arg_mappings): + rm = ResourceManager(args=None, + hosts="worker-0, worker-1", + num_gpus_per_node=4, + results_dir=None, + exps_dir=None, + arg_mappings=arg_mappings) + + if arg_mappings is not None: + for k, v in arg_mappings.items(): + assert k.strip() in rm.arg_mappings.keys() + assert arg_mappings[k.strip()].strip() == rm.arg_mappings[k.strip()] + + +@pytest.mark.parametrize("active_resources", + [ + {"worker-0": [0, 1, 2, 3]}, + {"worker-0": [0, 1, 2, 3], "worker-1": [0, 1, 2, 3]}, + {"worker-0": [0], "worker-1": [0, 1, 2], "worker-2": [0, 1, 2]}, + {"worker-0": [0, 1], "worker-2": [4, 5]} + ] + ) # yapf: disable +def test_autotuner_resources(tmpdir, active_resources): + config_dict = { + "autotuning": { + "enabled": True, + "exps_dir": os.path.join(tmpdir, + 'exps_dir'), + "arg_mappings": {} + } + } + config_path = create_config_from_dict(tmpdir, config_dict) + args = dsrun.parse_args( + args=f'--autotuning {TUNE_OPTION} foo.py --deepspeed_config {config_path}'.split( + )) + tuner = Autotuner(args=args, active_resources=active_resources) + + expected_num_nodes = len(list(active_resources.keys())) + assert expected_num_nodes == tuner.exp_num_nodes + + expected_num_gpus = min([len(v) for v in active_resources.values()]) + assert expected_num_gpus == tuner.exp_num_gpus diff --git a/tests/unit/checkpoint/common.py 
b/tests/unit/checkpoint/common.py new file mode 100644 index 0000000000000000000000000000000000000000..5b89d6811b0120aaab2d9a031608c9d62e1aba69 --- /dev/null +++ b/tests/unit/checkpoint/common.py @@ -0,0 +1,220 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import torch +import numbers + +import deepspeed +from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer +from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer +from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer +from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 + +from unit.simple_model import * + + +def compare_deepspeed_states(saved_model, loaded_model): + # These are compared in more depth in other places + assert hasattr(loaded_model, 'module') + + assert saved_model.sparse_tensor_module_names == loaded_model.sparse_tensor_module_names + assert saved_model.skipped_steps == loaded_model.skipped_steps + assert saved_model.global_steps == loaded_model.global_steps + + +def compare_model_states(saved_model, + loaded_model, + compare_optimizer=True, + load_module_only=False): + if not load_module_only: + compare_deepspeed_states(saved_model, loaded_model) + + for p0, p1 in zip(saved_model.module.named_parameters(), loaded_model.module.named_parameters()): + np0, p0 = p0 + np1, p1 = p1 + if 'deepspeed_moe.gate.wg' in np0: + # these params are converted to float at runtime, cast to half for comparison + p1 = p1.half() + p0 = p0.half() + assert id(p0) != id(p1), f'Comparing fp16 model state tensor against itself : {id(p0)} <====> {id(p1)}' + try: + assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}" + except RuntimeError as err: + print(f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}") + raise err + + if not compare_optimizer: + return + + if DeepSpeedZeroOptimizer_Stage3 is not None and isinstance( + saved_model.optimizer, + 
DeepSpeedZeroOptimizer_Stage3): + for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, loaded_model.optimizer.fp32_partitioned_groups_flat): + assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" + + elif isinstance(saved_model.optimizer, DeepSpeedZeroOptimizer): + for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups): + assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' + assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" + + elif isinstance(saved_model.optimizer, FP16_Optimizer): + for p0, p1 in zip(saved_model.optimizer.fp32_groups_flat, loaded_model.optimizer.fp32_groups_flat): + assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' + assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" + + elif isinstance(saved_model.optimizer, FP16_UnfusedOptimizer): + for params0, params1 in zip(saved_model.optimizer.fp32_groups, loaded_model.optimizer.fp32_groups): + for p0, p1 in zip(params0, params1): + assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' + assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" + elif isinstance(saved_model.optimizer, torch.optim.Optimizer): + pass + else: + assert False, f'Unexpected Optimizer Type: {saved_model.optimizer}' + + +def compare_state_dicts(state0, state1, expected_mismatch_keys=[]): + for (k0, s0), (k1, s1) in zip(state0.items(), state1.items()): + assert k0 == k1, f'failure due to key mismatch {k0} != {k1}' + if k0 in expected_mismatch_keys: + continue + if isinstance(s0, torch.Tensor) and isinstance(s1, torch.Tensor): + assert id(s0) != id(s1), f'Comparing optimizer state tensor against itself: {id(s0)} <====> {id(s1)}' + assert 
torch.equal(s0.to('cpu'), s1.to('cpu')) + else: + assert s0 == s1, f'failures with keys = {k0}, {k1}, values = {type(s0[0])} and {type(s1[0])}' + + +def compare_optimizer_states(saved_model, loaded_model, hidden_dim, fp16=True): + saved_optimizer = saved_model.optimizer.optimizer if fp16 else saved_model.optimizer + loaded_optimizer = loaded_model.optimizer.optimizer if fp16 else loaded_model.optimizer + + for state0, state1 in zip(saved_optimizer.state.values(), + loaded_optimizer.state.values()): + compare_state_dicts(state0, state1) + + +def compare_lr_scheduler_states(saved_model, loaded_model): + assert hasattr(saved_model, 'lr_scheduler') + assert hasattr(loaded_model, 'lr_scheduler') + + saved_scheduler = saved_model.lr_scheduler + loaded_scheduler = loaded_model.lr_scheduler + + assert hasattr(saved_scheduler, 'state_dict') + assert hasattr(loaded_scheduler, 'state_dict') + + saved_sd = saved_scheduler.state_dict() + loaded_sd = loaded_scheduler.state_dict() + + print(f"saved_sd = {saved_sd}") + print(f"loaded_sd = {loaded_sd}") + + assert saved_sd.keys() == loaded_sd.keys() + + for state0, state1 in zip(saved_sd.values(), loaded_sd.values()): + if isinstance(state0, numbers.Number) and isinstance(state1, numbers.Number): + assert state0 == state1 + + +# following mixture-of-experts.md +def create_moe_param_groups(model): + from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer + + parameters = {'params': [p for p in model.parameters()], 'name': 'parameters'} + return split_params_into_different_moe_groups_for_optimizer(parameters) + + +def create_deepspeed_model(config_dict, model, base_optimizer): + ds_model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=create_moe_param_groups(model), + optimizer=base_optimizer) + return ds_model + + +def checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + load_optimizer_states=False, + load_lr_scheduler_states=False, + 
fp16=True, + train_batch=False, + base_optimizers=[None, + None], + empty_tag=False, + seq_dataloader=False, + load_module_only=False): + dtype = torch.half if fp16 else torch.float32 + ds_model = create_deepspeed_model(config_dict=config_dict, + model=models[0], + base_optimizer=base_optimizers[0]) + + if seq_dataloader: + data_loader = sequence_dataloader(model=ds_model, + total_samples=50, + hidden_dim=hidden_dim, + device=ds_model.device, + dtype=dtype) + else: + data_loader = random_dataloader(model=ds_model, + total_samples=50, + hidden_dim=hidden_dim, + device=ds_model.device, + dtype=dtype) + + if train_batch: + ds_model.set_dataloader(data_loader) + for _, batch in enumerate(data_loader): + loss = ds_model.train_batch() + else: + for _, batch in enumerate(data_loader): + loss = ds_model(batch[0], batch[1]) + ds_model.backward(loss) + ds_model.step() + + trained_model = ds_model + + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + save_tag = None if empty_tag else '1' + + trained_model.save_checkpoint(save_folder, tag=save_tag) + + dist.barrier() + + for root, _, files in os.walk(save_folder): + for f in files: + if "_expert_" in f and "_model_states" in f: + expert = torch.load(os.path.join(root, f)) + needed, storages = 0, {} + for name, tensor in expert.items(): + needed += tensor.size().numel() + storage = tensor.storage() + # some storage can be shared within an expert's checkpoint + storages[storage.data_ptr()] = storage.size() + stored = sum(v for _, v in storages.items()) + assert needed == stored, f"MoE expert checkpoint uses more storage than required: {f}" + + loaded_model = create_deepspeed_model(config_dict=config_dict, + model=models[1], + base_optimizer=base_optimizers[1]) + assert list(trained_model.parameters())[0].dtype == list( + loaded_model.parameters())[0].dtype + + loaded_model.load_checkpoint(save_folder, + tag=save_tag, + load_optimizer_states=load_optimizer_states, + load_lr_scheduler_states=load_lr_scheduler_states, + 
load_module_only=load_module_only) + + compare_model_states(trained_model, + loaded_model, + compare_optimizer=load_optimizer_states, + load_module_only=load_module_only) + + if load_optimizer_states: + compare_optimizer_states(trained_model, loaded_model, hidden_dim, fp16) + + if load_lr_scheduler_states: + compare_lr_scheduler_states(trained_model, loaded_model) diff --git a/tests/unit/checkpoint/test_latest_checkpoint.py b/tests/unit/checkpoint/test_latest_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..955edfdec3ac31ce63b3104fa1d7ddd4af3bdf91 --- /dev/null +++ b/tests/unit/checkpoint/test_latest_checkpoint.py @@ -0,0 +1,53 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import deepspeed + +from unit.common import DistributedTest +from unit.simple_model import * + +from unit.checkpoint.common import checkpoint_correctness_verification + + +class TestLatestCheckpoint(DistributedTest): + world_size = 1 + + def test_existing_latest(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + } + } + hidden_dim = 10 + models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] + checkpoint_correctness_verification(config_dict=config_dict, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=True, + load_lr_scheduler_states=False, + fp16=False, + empty_tag=True) + + def test_missing_latest(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + } + } + hidden_dim = 10 + model = SimpleModel(hidden_dim) + model, _, _,_ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + # should be no-op, since latest doesn't exist + model.load_checkpoint(tmpdir) diff --git a/tests/unit/checkpoint/test_lr_scheduler.py b/tests/unit/checkpoint/test_lr_scheduler.py new file mode 100644 
index 0000000000000000000000000000000000000000..f6a8f5ebdd4ab21452a4b40bacd0b24e5be5d1fa --- /dev/null +++ b/tests/unit/checkpoint/test_lr_scheduler.py @@ -0,0 +1,122 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import deepspeed +from deepspeed.ops.op_builder import CPUAdamBuilder + +from unit.common import DistributedTest +from unit.simple_model import * + +from unit.checkpoint.common import checkpoint_correctness_verification + +import pytest + + +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [(0, + False), + (1, + False), + (2, + False), + (2, + True), + (3, + False), + (3, + True)]) +class TestLRSchedulerCheckpoint(DistributedTest): + world_size = 2 + + def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.00015, + "betas": [0.8, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + } + } + hidden_dim = 10 + + if zero_stage == 3: + global DeepSpeedZeroOptimizer_Stage3 + from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 + with deepspeed.zero.Init(): + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + else: + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + + checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + load_optimizer_states=False, + load_lr_scheduler_states=True) + + def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): + if use_cpu_offload and not 
deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 1e-5 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + } + hidden_dim = 10 + + if zero_stage == 3: + with deepspeed.zero.Init(): + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + else: + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + + checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + load_optimizer_states=False, + load_lr_scheduler_states=False) diff --git a/tests/unit/checkpoint/test_moe_checkpoint.py b/tests/unit/checkpoint/test_moe_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..edce2959aa203485b910a86b2bc452b86f1b078f --- /dev/null +++ b/tests/unit/checkpoint/test_moe_checkpoint.py @@ -0,0 +1,109 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer + +from unit.common import DistributedTest +from unit.simple_model import * +from unit.util import required_torch_version + +from unit.checkpoint.common import checkpoint_correctness_verification + +import pytest + + +class TestMoECheckpoint(DistributedTest): + world_size = 4 + + @pytest.mark.parametrize("ep_size", [4]) + def test_checkpoint_moe(self, tmpdir, ep_size): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 8, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 16 + + models = [ + SimpleMoEModel(hidden_dim=hidden_dim, + num_experts=ep_size, 
+ ep_size=ep_size) for _ in range(2) + ] + optimizers = [torch.optim.AdamW(params=model.parameters()) for model in models] + checkpoint_correctness_verification(config_dict, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=True, + load_lr_scheduler_states=False, + fp16=config_dict["fp16"]["enabled"], + empty_tag=True, + base_optimizers=optimizers, + seq_dataloader=True) + + @pytest.mark.parametrize("ep_size, load_optim_states", + [(4, + True), + (4, + False), + (2, + True), + (2, + False)]) + def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 8, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.00015, + "betas": [0.8, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": 2, + } + } + hidden_dim = 16 + + models = [ + SimpleMoEModel(hidden_dim=hidden_dim, + num_experts=ep_size, + ep_size=ep_size) for _ in range(2) + ] + # param group must have a random unique name (for now) + # TODO: clean-up this requirement, the unique name should not be required here + param_groups = [{ + 'params': [p for p in model.parameters()], + 'name': 'random-unique-name' + } for model in models] + params = [ + split_params_into_different_moe_groups_for_optimizer(group) + for group in param_groups + ] + optimizers = [torch.optim.AdamW(params=param) for param in params] + checkpoint_correctness_verification(config_dict, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=load_optim_states, + load_lr_scheduler_states=False, + fp16=config_dict["fp16"]["enabled"], + empty_tag=True, + base_optimizers=optimizers, + seq_dataloader=True) diff --git a/tests/unit/checkpoint/test_other_optimizer.py 
b/tests/unit/checkpoint/test_other_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d09157a2c80d8b4e82a72a886a85c2bafb7fcc55 --- /dev/null +++ b/tests/unit/checkpoint/test_other_optimizer.py @@ -0,0 +1,132 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import deepspeed +from deepspeed.ops.op_builder import FusedLambBuilder + +from unit.common import DistributedTest +from unit.simple_model import * + +from unit.checkpoint.common import checkpoint_correctness_verification + +import pytest + + +class TestOtherOptimizerCheckpoint(DistributedTest): + world_size = 2 + + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="lamb is not compatible") + def test_checkpoint_unfused_optimizer(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True + }, + "scheduler": { + "type": "OneCycle", + "params": { + "cycle_first_step_size": 1000, + "cycle_first_stair_count": 500, + "cycle_second_step_size": 1000, + "cycle_second_stair_count": 500, + "decay_step_size": 1000, + "cycle_min_lr": 0.0001, + "cycle_max_lr": 0.0010, + "decay_lr_rate": 0.001, + "cycle_min_mom": 0.85, + "cycle_max_mom": 0.99, + "decay_mom_rate": 0.0 + } + } + } + + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + + # Load & verify optimizer states + checkpoint_correctness_verification(config_dict, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=True) + + # Ignore optimizer states + checkpoint_correctness_verification(config_dict, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=False) + + def test_checkpoint_fused_optimizer(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": 
"Adam", + "params": { + "lr": 0.00015, + "betas": [0.8, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "fp16": { + "enabled": True + } + } + + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + + # Load & verify optimizer states + checkpoint_correctness_verification(config_dict, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=True) + + # Ignore optimizer states + checkpoint_correctness_verification(config_dict, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=False) + + def test_checkpoint_fp32_optimizer(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "betas": [0.8, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "fp16": { + "enabled": False + } + } + + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + checkpoint_correctness_verification(config_dict, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + fp16=False) diff --git a/tests/unit/checkpoint/test_pipeline.py b/tests/unit/checkpoint/test_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..c698798fa96564b5f82dcff937fe2046073aa8ce --- /dev/null +++ b/tests/unit/checkpoint/test_pipeline.py @@ -0,0 +1,109 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine +from unit.common import DistributedTest +from unit.simple_model import * + +from unit.checkpoint.common import checkpoint_correctness_verification + +import pytest + + +class TestPipelineCheckpoint(DistributedTest): + world_size = 4 + + @pytest.mark.parametrize("zero_stage", [0, 1]) + def test_checkpoint_pipe_engine(self, zero_stage, tmpdir): + config_dict = { + 
"train_batch_size": 2, + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5 + } + }, + "zero_optimization": { + "stage": zero_stage + }, + "fp16": { + "enabled": zero_stage > 0 + }, + "scheduler": { + "type": "OneCycle", + "params": { + "cycle_first_step_size": 1000, + "cycle_first_stair_count": 500, + "cycle_second_step_size": 1000, + "cycle_second_stair_count": 500, + "decay_step_size": 1000, + "cycle_min_lr": 0.0001, + "cycle_max_lr": 0.0010, + "decay_lr_rate": 0.001, + "cycle_min_mom": 0.85, + "cycle_max_mom": 0.99, + "decay_mom_rate": 0.0 + } + } + } + + models = [LinearStackPipe(num_stages=2) for _ in range(2)] + checkpoint_correctness_verification(config_dict=config_dict, + models=models, + hidden_dim=models[0].hidden_dim, + tmpdir=tmpdir, + fp16=config_dict['fp16']['enabled'], + load_optimizer_states=True, + load_lr_scheduler_states=True, + train_batch=True) + + @pytest.mark.parametrize( + "base_topo,test_topo", + [ + #(PipeTopo(num_pp=1, + # num_dp=4), + # PipeTopo(num_pp=4, + # num_dp=1)), + #(PipeTopo(num_pp=2, + # num_dp=2), + # PipeTopo(num_pp=2, + # num_dp=2)), + #(PipeTopo(num_pp=4, + # num_dp=1), + # PipeTopo(num_pp=2, + # num_dp=2)), + ]) + def test_checkpoint_pipe_module(self, base_topo, test_topo, tmpdir): + checkpoint_engine = TorchCheckpointEngine() + base_model = LinearStackPipe(topology=base_topo) + base_model.save_state_dict(tmpdir, checkpoint_engine=checkpoint_engine) + + dist.barrier() + + test_model = LinearStackPipe(topology=test_topo) + test_model.load_state_dir(tmpdir, checkpoint_engine=checkpoint_engine) + + # Base and test can have different lengths, so make sure we map from the + # smaller to larger model + if len(base_model.forward_funcs) < len(test_model.forward_funcs): + A = base_model + B = test_model + else: + A = test_model + B = base_model + + # Compare layers individually since partitions are different + for idx, A_layer in enumerate(A.forward_funcs): + 
if not hasattr(A_layer, 'parameters'): + # Skip functionals, etc. + continue + + # Find the corresponding layer in B + global_idx = idx + A._local_start + B_local_idx = global_idx - B._local_start + B_layer = B.forward_funcs[B_local_idx] + + # Compare layer parameters + for p0, p1 in zip(A_layer.parameters(), B_layer.parameters()): + assert torch.allclose(p0, p1, atol=1e-07), f"Model state {p0} is not equal to {p1}" diff --git a/tests/unit/checkpoint/test_reshape_checkpoint.py b/tests/unit/checkpoint/test_reshape_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..c9ae854521bab48e113b8ba90c67d8e5551fe2e7 --- /dev/null +++ b/tests/unit/checkpoint/test_reshape_checkpoint.py @@ -0,0 +1,57 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from deepspeed.checkpoint import model_3d_desc + + +def _do_reshape(src_3d, tgt_3d): + assert src_3d.can_reshape(tgt_3d) + new_3d_map = src_3d.reshape(tgt_3d) + + assert len(new_3d_map) == tgt_3d.dp_degree + for new_2d_map in new_3d_map: + assert new_2d_map.pp_degree == tgt_3d.pp_degree + assert new_2d_map.tp_degree == tgt_3d.tp_degree + + return new_3d_map + + +# Specify 3d shape as pp/tp/dp +def test_reshape_222_to_111(): + src_3d = model_3d_desc(pp_degree=2, tp_degree=2, dp_degree=2) + tgt_3d = model_3d_desc(pp_degree=1, tp_degree=1, dp_degree=1) + + new_3d_map = _do_reshape(src_3d, tgt_3d) + + assert new_3d_map[0].get_data(pp_index=0, tp_index=0) == [0, 4, 1, 5, 2, 6, 3, 7] + + +def test_reshape_222_to_121(): + src_3d = model_3d_desc(pp_degree=2, tp_degree=2, dp_degree=2) + tgt_3d = model_3d_desc(pp_degree=1, tp_degree=2, dp_degree=1) + + new_3d_map = _do_reshape(src_3d, tgt_3d) + + assert new_3d_map[0].get_data(pp_index=0, tp_index=0) == [0, 4, 2, 6] + assert new_3d_map[0].get_data(pp_index=0, tp_index=1) == [1, 5, 3, 7] + + +def test_reshape_222_to_122(): + src_3d = model_3d_desc(pp_degree=2, tp_degree=2, dp_degree=2) + tgt_3d = model_3d_desc(pp_degree=1, tp_degree=2, dp_degree=2) + + new_3d_map 
= _do_reshape(src_3d, tgt_3d) + + assert new_3d_map[0].get_data(pp_index=0, tp_index=0) == [0, 4] + assert new_3d_map[0].get_data(pp_index=0, tp_index=1) == [1, 5] + assert new_3d_map[1].get_data(pp_index=0, tp_index=0) == [2, 6] + assert new_3d_map[1].get_data(pp_index=0, tp_index=1) == [3, 7] + + +def test_reshape_222_to_211(): + src_3d = model_3d_desc(pp_degree=2, tp_degree=2, dp_degree=2) + tgt_3d = model_3d_desc(pp_degree=2, tp_degree=1, dp_degree=1) + + new_3d_map = _do_reshape(src_3d, tgt_3d) + + assert new_3d_map[0].get_data(pp_index=0, tp_index=0) == [0, 4, 1, 5] + assert new_3d_map[0].get_data(pp_index=1, tp_index=0) == [2, 6, 3, 7] diff --git a/tests/unit/checkpoint/test_sparse.py b/tests/unit/checkpoint/test_sparse.py new file mode 100644 index 0000000000000000000000000000000000000000..4f07acebc058f24e6d26ff25749190f334c380c9 --- /dev/null +++ b/tests/unit/checkpoint/test_sparse.py @@ -0,0 +1,96 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import deepspeed + +from unit.common import DistributedTest +from unit.simple_model import * + +import pytest + + +class TestSparseCheckpoint(DistributedTest): + world_size = 2 + + @pytest.mark.parametrize(["to_save_model_has_embedding", + "to_save_model_sparse"], + [ + [False, + False], + [True, + False], + [True, + True], + ]) + @pytest.mark.parametrize(["destination_has_embedding", + "destination_sparse"], + [ + [False, + False], + [True, + False], + [True, + True], + ]) + def test_non_strict_load_sparse(self, + tmpdir, + to_save_model_has_embedding, + to_save_model_sparse, + destination_has_embedding, + destination_sparse): + class ModelNoEmbedding(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 1) + + def forward(self, x): + return self.linear(x) + + class ModelEmbedding(torch.nn.Module): + def __init__(self): + super().__init__() + self.emb = torch.nn.Embedding(10, 3) + self.linear = torch.nn.Linear(3, 1) + + def forward(self, x, offsets): + return 
self.linear(self.emb(x, offsets)) + + if to_save_model_has_embedding: + model_to_save = ModelEmbedding() + else: + model_to_save = ModelNoEmbedding() + if destination_has_embedding: + model_destination = ModelEmbedding() + else: + model_destination = ModelNoEmbedding() + + engine_to_save, _, _, _ = deepspeed.initialize( + model=model_to_save, config={"train_batch_size": 2, "sparse_gradients": to_save_model_sparse} + ) + engine_destination, _, _, _ = deepspeed.initialize( + model=model_destination, config={"train_batch_size": 2, "sparse_gradients": destination_sparse} + ) + + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + save_tag = '1' + + engine_to_save.save_checkpoint(save_folder, tag=save_tag) + + is_sparse_destination = isinstance(model_destination, + ModelEmbedding) and destination_sparse + if isinstance(model_destination, + ModelEmbedding) and model_destination.emb.sparse: + assert "emb.weight" in engine_destination.sparse_tensor_module_names + engine_destination.load_checkpoint(save_folder, + tag=save_tag, + load_module_strict=False, + load_optimizer_states=False, + load_lr_scheduler_states=False, + load_module_only=False) + if isinstance(model_destination, + ModelEmbedding) and isinstance(model_to_save, + ModelEmbedding): + assert engine_destination.sparse_tensor_module_names == engine_to_save.sparse_tensor_module_names + elif isinstance(model_destination, ModelEmbedding): + assert not is_sparse_destination or "emb.weight" in engine_destination.sparse_tensor_module_names + else: + assert len(engine_destination.sparse_tensor_module_names) == 0 diff --git a/tests/unit/checkpoint/test_tag_validation.py b/tests/unit/checkpoint/test_tag_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..d9489622305d60e72a49e355979550e5616004d1 --- /dev/null +++ b/tests/unit/checkpoint/test_tag_validation.py @@ -0,0 +1,63 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import deepspeed + +from unit.common import DistributedTest +from 
unit.simple_model import * + +import pytest + + +class TestCheckpointValidationTag(DistributedTest): + world_size = 2 + + @pytest.mark.parametrize('valid_mode', ["FAIL", "WARN", "IGNORE"]) + def test_checkpoint_unique_tag(self, tmpdir, valid_mode): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "checkpoint": { + "tag_validation": valid_mode + } + } + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + model, _, _,_ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + if valid_mode == "FAIL": + with pytest.raises(AssertionError): + model.save_checkpoint(save_dir=tmpdir, tag=f"tag-{dist.get_rank()}") + else: + model.save_checkpoint(save_dir=tmpdir, tag=f"tag-{dist.get_rank()}") + + def test_checkpoint_unknown_tag_validation(self, tmpdir): + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "checkpoint": { + "tag_validation": "foo" + } + } + hidden_dim = 10 + args = args_from_dict(tmpdir, config_dict) + model = SimpleModel(hidden_dim) + + with pytest.raises(deepspeed.DeepSpeedConfigError): + model, _, _,_ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..7de8e9bff90825d69eb67e5afe6fdfdbdeb800b7 --- /dev/null +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -0,0 +1,460 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import deepspeed +from deepspeed.ops.op_builder import CPUAdamBuilder + +from unit.common import DistributedTest, DistributedFixture +from unit.simple_model import * +from unit.util import required_minimum_torch_version + +from unit.checkpoint.common import * + +import pytest + + +class 
TestZeROCheckpoint(DistributedTest): + world_size = 2 + + @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', + [(1, + False, + 'Adam'), + (2, + False, + 'Adam'), + (2, + True, + 'deepspeed_adam'), + (3, + False, + 'Adam'), + (3, + True, + 'deepspeed_adam')]) + def test_load_optimizer_state(self, + tmpdir, + zero_stage, + use_cpu_offload, + adam_optimizer): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.00015, + "betas": [0.8, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "wall_clock_breakdown": True, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload + } + } + hidden_dim = 10 + + if zero_stage == 3: + with deepspeed.zero.Init(): + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + else: + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + + checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + load_optimizer_states=True) + + @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', + [(1, + False, + "Adam"), + (2, + False, + "Adam"), + (2, + True, + 'deepspeed_adam'), + (3, + False, + 'Adam'), + (3, + True, + 'deepspeed_adam')]) + def test_not_load_optimizer_state(self, + tmpdir, + zero_stage, + use_cpu_offload, + adam_optimizer): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.00015, + "betas": [0.8, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage, + 
"cpu_offload": use_cpu_offload + } + } + hidden_dim = 10 + + if zero_stage == 3: + global DeepSpeedZeroOptimizer_Stage3 + from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 + with deepspeed.zero.Init(): + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + else: + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + + checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + load_optimizer_states=False) + + @pytest.mark.parametrize('zero_stage', [1, 2]) + def test_hybrid_optimizer_state(self, tmpdir, zero_stage): + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage + }, + "zero_allow_untested_optimizer": True, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + hidden_dim = 10 + models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] + optimizers = [HybridStateOptimizer(model.parameters()) for model in models] + + checkpoint_correctness_verification(config_dict, + models=models, + base_optimizers=optimizers, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=True) + + @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) + def test_load_module_only(self, tmpdir, zero_stage): + config_dict = { + "train_batch_size": 2, + "optimizer": { + "type": 'Adam' + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": zero_stage, + } + } + hidden_dim = 10 + + if zero_stage == 3: + with deepspeed.zero.Init(): + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + else: + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + + checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + load_module_only=True) + + +class ws4_model_checkpoint(DistributedFixture): + world_size = 4 + + def run(self, class_tmpdir, elastic_save, load_optim): + 
ds_config = { + "train_batch_size": 4, + "optimizer": { + "type": 'Adam' + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": 2, + "elastic_checkpoint": elastic_save + } + } + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + model, _, _, _ = deepspeed.initialize(config=ds_config, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=8, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + if load_optim: + torch.save(model.optimizer.optimizer.state_dict(), + os.path.join(class_tmpdir, + 'opt-state-dict')) + model.save_checkpoint(class_tmpdir) + + +@pytest.mark.parametrize("elastic_save", [True, False]) +@pytest.mark.parametrize("elastic_load", [True, False]) +@pytest.mark.parametrize("load_optim", [True, False]) +class TestZeROElasticCheckpoint(DistributedTest): + world_size = 2 + + def test_elastic_checkpoint_fixed_dp(self, + tmpdir, + elastic_save, + elastic_load, + load_optim): + ds_config = { + "train_batch_size": 2, + "optimizer": { + "type": 'Adam' + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": 2, + "elastic_checkpoint": elastic_save + } + } + hidden_dim = 10 + + # torch 1.2.* stores raw tensor id numbers in checkpoint state which leads to + # false positive mismatches in checkpoint state comparisons. + # Newer torch versions store tensor ids as 0, 1, 2, ... 
+ expected_mismatch_keys = [] if required_minimum_torch_version(1, + 4) else ['params'] + models = [SimpleModel(hidden_dim) for _ in range(2)] + model, _, _, _ = deepspeed.initialize(config=ds_config, + model=models[0], + model_parameters=models[0].parameters()) + data_loader = random_dataloader(model=model, + total_samples=8, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + if load_optim: + torch.save(model.optimizer.optimizer.state_dict(), + os.path.join(tmpdir, + 'opt-state-dict')) + model.save_checkpoint(tmpdir) + + ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load + model, _, _, _ = deepspeed.initialize(config=ds_config, + model=models[1], + model_parameters=models[1].parameters()) + model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) + + if load_optim: + saved_sd = torch.load(os.path.join(tmpdir, 'opt-state-dict')) + curr_sd = model.optimizer.optimizer.state_dict() + for curr_param_group, saved_param_group in zip(curr_sd['param_groups'], saved_sd['param_groups']): + compare_state_dicts(curr_param_group, + saved_param_group, + expected_mismatch_keys) + + data_loader = random_dataloader(model=model, + total_samples=8, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + def test_elastic_checkpoint_change_dp(self, + ws4_model_checkpoint, + class_tmpdir, + elastic_save, + elastic_load, + load_optim): + ds_config = { + "train_batch_size": 4, + "optimizer": { + "type": 'Adam' + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": 2, + "elastic_checkpoint": elastic_load + } + } + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + # Load checkpoint with dp world size = 2 + model, _, _, _ = deepspeed.initialize(config=ds_config, + model=model, + 
model_parameters=model.parameters()) + if load_optim: + with pytest.raises(deepspeed.runtime.zero.utils.ZeRORuntimeException): + model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim) + else: + model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim) + + +class TestZeROSaveLoadEdgeCase(DistributedTest): + world_size = 2 + + @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) + def test_immediate_save_load(self, tmpdir, zero_stage): + config_dict = { + "train_batch_size": 4, + "optimizer": { + "type": 'Adam' + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": zero_stage, + } + } + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + ds_model = create_deepspeed_model(config_dict=config_dict, + model=model, + base_optimizer=None) + ds_model.save_checkpoint(tmpdir) + ds_model.load_checkpoint(tmpdir, + load_optimizer_states=False, + load_lr_scheduler_states=False, + load_module_only=False) + + @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) + def test_load_immediate_save(self, tmpdir, zero_stage): + config_dict = { + "train_batch_size": 4, + "optimizer": { + "type": 'Adam' + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": zero_stage, + } + } + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + # 1. pretrain a model and save it + dtype = torch.half + ds_model = create_deepspeed_model(config_dict=config_dict, + model=model, + base_optimizer=None) + data_loader = random_dataloader(model=ds_model, + total_samples=1, + hidden_dim=hidden_dim, + device=ds_model.device, + dtype=dtype) + for _, batch in enumerate(data_loader): + loss = ds_model(batch[0], batch[1]) + ds_model.backward(loss) + ds_model.step() + ds_model.save_checkpoint(tmpdir) + + # 2. 
load and immediately save a model with a fresh ds engine + ds_model = create_deepspeed_model(config_dict=config_dict, + model=model, + base_optimizer=None) + ds_model.load_checkpoint(tmpdir, + load_optimizer_states=False, + load_lr_scheduler_states=False, + load_module_only=False) + ds_model.save_checkpoint(tmpdir) + + @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) + def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage): + config_dict = { + "optimizer": { + "type": 'Adam' + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": zero_stage, + "stage3_gather_fp16_weights_on_model_save": True, + }, + "gradient_accumulation_steps": 2, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 4, + } + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + # This test reproduces a bug where one tries to retrieve a 16bit model before grad_accum + # cycle was completed. + # So we config grad_accum=2 and step only once and save_16bit_model + ds_model = create_deepspeed_model(config_dict=config_dict, + model=model, + base_optimizer=None) + + data_loader = random_dataloader(model=ds_model, + total_samples=2, + hidden_dim=hidden_dim, + device=ds_model.device, + dtype=torch.half) + + batch = next(iter(data_loader)) + loss = ds_model(batch[0], batch[1]) + ds_model.backward(loss) + ds_model.step() + + # we stepped only once, and now save 16bit model before gradient_accumulation_steps=2 is complete + ds_model.save_16bit_model(tmpdir, "model.pt") + + # let's test just as well that we can save the checkpoint too + ds_model.save_checkpoint(tmpdir) diff --git a/tests/unit/comm/test_dist.py b/tests/unit/comm/test_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..6005c926f793b73ed61219c39e0de194d9755ee6 --- /dev/null +++ b/tests/unit/comm/test_dist.py @@ -0,0 +1,200 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import torch +import deepspeed.comm as dist +import deepspeed + 
+from unit.common import DistributedTest, DistributedFixture, get_master_port +from unit.simple_model import SimpleModel +from deepspeed.accelerator import get_accelerator + +import pytest + + +class TestInit(DistributedTest): + world_size = 3 + + def test(self): + assert dist.is_initialized() + assert dist.get_world_size() == 3 + assert dist.get_rank() < 3 + + +# Demonstration of pytest's parameterization and fixtures +@pytest.fixture(params=["hello"]) +def greeting(request): + return request.param + + +@pytest.mark.parametrize("number,color", [(1138, "purple")]) +class TestDistArgs(DistributedTest): + world_size = 2 + """ Classes that use DistributedTest class must define a test* method """ + @pytest.mark.parametrize("shape", ["icosahedron"]) + def test(self, number, color, shape, greeting): + """Ensure that we can parse args to DistributedTest methods. """ + assert dist.get_world_size() == 2 + assert number == 1138 + assert color == "purple" + assert shape == "icosahedron" + assert greeting == "hello" + + +# Demonstration of distributed tests grouped in single class +@pytest.mark.parametrize("number", [1138]) +class TestGroupedDistTest(DistributedTest): + world_size = 2 + + def test_one(self, number): + assert dist.get_world_size() == 2 + assert number == 1138 + + def test_two(self, number, color="purple"): + assert dist.get_world_size() == 2 + assert number == 1138 + assert color == "purple" + + +# Demonstration of world_size override +class TestWorldSizeOverrideDistTest(DistributedTest): + world_size = 2 + + def test_world_size_2(self): + assert dist.get_world_size() == 2 + + @pytest.mark.world_size(1) + def test_world_size_1(self): + assert dist.get_world_size() == 1 + + +# Demonstration of the DistributedFixture class +@pytest.fixture(params=[2, 4]) +def val1(request): + return request.param + + +@pytest.fixture(params=[16, 32]) +def val2(request): + return request.param + + +class distributed_fixture(DistributedFixture): + world_size = 2 + + def run(self, 
class_tmpdir, val1, val2): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + local_rank = os.environ["LOCAL_RANK"] + file_path = os.path.join(class_tmpdir, f"checkpoint-{local_rank}.pt") + with open(file_path, "w") as f: + f.write(f"{local_rank},{val1},{val2}") + + +class TestDistributedFixture(DistributedTest): + world_size = 1 + + def test(self, distributed_fixture, class_tmpdir, val1, val2): + for rank in range(2): + file_path = os.path.join(class_tmpdir, f"checkpoint-{rank}.pt") + with open(file_path, "r") as f: + chkpt = f.read() + assert chkpt == f"{rank},{val1},{val2}" + assert int(os.environ["WORLD_SIZE"]) == 1 + + +class TestDistAllReduce(DistributedTest): + world_size = [1, 2, 4] + + def test(self): + x = torch.ones(1, 3).to(get_accelerator().device_name()) * (dist.get_rank() + 1) + sum_of_ranks = (dist.get_world_size() * (dist.get_world_size() + 1)) // 2 + result = torch.ones(1, 3).to(get_accelerator().device_name()) * sum_of_ranks + dist.all_reduce(x) + assert torch.all(x == result) + + +@pytest.mark.parametrize("dist_init_required", [True, False, None]) +class TestDistInit(DistributedTest): + init_distributed = False + + def test_already_init(self, dist_init_required): + torch.distributed.init_process_group( + get_accelerator().communication_backend_name()) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) + + def test_no_init(self, dist_init_required): + if dist_init_required or dist_init_required is None: + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) + else: + # torch.dist is not done and for some reason the user says they don't want it done + with pytest.raises(Exception): + deepspeed.init_distributed( + get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) + + +class TestDistInitNoEnv(DistributedTest): + world_size = 1 + init_distributed = False + set_dist_env = 
False + + def test(self): + torch.distributed.init_process_group( + backend=get_accelerator().communication_backend_name(), + init_method=f"tcp://127.0.0.1:{get_master_port()}", + world_size=1, + rank=0) + assert torch.distributed.is_initialized() + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + auto_mpi_discovery=True) + + +@pytest.mark.parametrize("dist_init_required", [True, False]) +class TestDistInitWithModel(DistributedTest): + init_distributed = False + + def test_already_init(self, dist_init_required): + torch.distributed.init_process_group( + get_accelerator().communication_backend_name()) + model = SimpleModel(4) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Adam", + "params": {} + } + } + engine, *_ = deepspeed.initialize( + model=model, + config=config_dict, + model_parameters=model.parameters(), + dist_init_required=dist_init_required + ) + + def test_no_init(self, dist_init_required): + model = SimpleModel(4) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Adam", + "params": {} + } + } + if dist_init_required: + engine, *_ = deepspeed.initialize( + model=model, + config=config_dict, + model_parameters=model.parameters(), + dist_init_required=dist_init_required + ) + else: + # torch.dist is not done and for some reason the user says they don't want it done + with pytest.raises(Exception): + engine, *_ = deepspeed.initialize( + model=model, + config=config_dict, + model_parameters=model.parameters(), + dist_init_required=dist_init_required + ) diff --git a/tests/unit/common.py b/tests/unit/common.py index 57ed50f17cea7b04fa5cb7b9931cff4568ef271d..35e8f3983072f0d858b15c855d554035cf0e8be9 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -1,22 +1,28 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import os import time +import inspect +from abc import ABC, abstractmethod +from pathlib import Path import torch -import torch.distributed 
as dist -from torch.multiprocessing import Process - +import torch.multiprocessing as mp import deepspeed +from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist +from torch.multiprocessing import Process import pytest -from functools import wraps -import unittest -from pathlib import Path - -from pathlib import Path +from _pytest.outcomes import Skipped +from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker # Worker timeout *after* the first worker has completed. DEEPSPEED_UNIT_WORKER_TIMEOUT = 120 +# Worker timeout for tests that hang +DEEPSPEED_TEST_TIMEOUT = 600 + def get_xdist_worker_id(): xdist_worker = os.environ.get('PYTEST_XDIST_WORKER', None) @@ -34,23 +40,36 @@ def get_master_port(): return master_port -def set_cuda_visibile(): +def set_accelerator_visible(): cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None) xdist_worker_id = get_xdist_worker_id() if xdist_worker_id is None: xdist_worker_id = 0 if cuda_visible is None: - # CUDA_VISIBLE_DEVICES is not set, discover it from nvidia-smi instead + # CUDA_VISIBLE_DEVICES is not set, discover it using accelerator specific command instead import subprocess - is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None - if is_rocm_pytorch: - rocm_smi = subprocess.check_output(['rocm-smi', '--showid']) - gpu_ids = filter(lambda s: 'GPU' in s, - rocm_smi.decode('utf-8').strip().split('\n')) - num_gpus = len(list(gpu_ids)) + if get_accelerator().device_name() == 'cuda': + is_rocm_pytorch = hasattr(torch.version, + 'hip') and torch.version.hip is not None + if is_rocm_pytorch: + rocm_smi = subprocess.check_output(['rocm-smi', '--showid']) + gpu_ids = filter(lambda s: 'GPU' in s, + rocm_smi.decode('utf-8').strip().split('\n')) + num_gpus = len(list(gpu_ids)) + else: + nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus']) + num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n')) else: - nvidia_smi = 
subprocess.check_output(['nvidia-smi', '--list-gpus']) - num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n')) + assert get_accelerator().device_name() == 'xpu' + import re + clinfo = subprocess.check_output(['clinfo']) + lines = clinfo.decode('utf-8').strip().split('\n') + num_gpus = 0 + for line in lines: + match = re.search('Device Type.*GPU', line) + if match: + num_gpus += 1 + cuda_visible = ",".join(map(str, range(num_gpus))) # rotate list based on xdist worker id, example below @@ -63,26 +82,102 @@ def set_cuda_visibile(): os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(dev_id_list) -def distributed_test(world_size=2, backend='nccl'): - """A decorator for executing a function (e.g., a unit test) in a distributed manner. - This decorator manages the spawning and joining of processes, initialization of - torch.distributed, and catching of errors. - - Usage example: - @distributed_test(worker_size=[2,3]) - def my_test(): - rank = dist.get_rank() - world_size = dist.get_world_size() - assert(rank < world_size) - - Arguments: - world_size (int or list): number of ranks to spawn. Can be a list to spawn - multiple tests. +class DistributedExec(ABC): + """ + Base class for distributed execution of functions/methods. Contains common + methods needed for DistributedTest and DistributedFixture. """ - def dist_wrap(run_func): - """Second-level decorator for dist_test. This actually wraps the function. """ - def dist_init(local_rank, num_procs, *func_args, **func_kwargs): - """Initialize torch.distributed and execute the user function. """ + world_size = 2 + backend = get_accelerator().communication_backend_name() + init_distributed = True + set_dist_env = True + requires_cuda_env = True + + @abstractmethod + def run(self): + ... 
+ + def __call__(self, request=None): + self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) + world_size = self.world_size + if self.requires_cuda_env and not get_accelerator().is_available(): + pytest.skip("only supported in accelerator environments.") + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + self._launch_procs(procs) + time.sleep(0.5) + + def _get_fixture_kwargs(self, request, func): + if not request: + return {} + # Grab fixture / parametrize kwargs from pytest request object + fixture_kwargs = {} + params = inspect.getfullargspec(func).args + params.remove("self") + for p in params: + try: + fixture_kwargs[p] = request.getfixturevalue(p) + except FixtureLookupError: + pass # test methods can have kwargs that are not fixtures + return fixture_kwargs + + def _launch_procs(self, num_procs): + if torch.cuda.is_available() and torch.cuda.device_count() < num_procs: + pytest.skip( + f"Skipping test because not enough GPUs are available: {num_procs} required, {torch.cuda.device_count()} available" + ) + mp.set_start_method('forkserver', force=True) + skip_msg = mp.Queue() # Allows forked processes to share pytest.skip reason + processes = [] + for local_rank in range(num_procs): + p = Process(target=self._dist_init, args=(local_rank, num_procs, skip_msg)) + p.start() + processes.append(p) + + # Now loop and wait for a test to complete. The spin-wait here isn't a big + # deal because the number of processes will be O(#GPUs) << O(#CPUs). 
+ any_done = False + start = time.time() + while (not any_done) and ((time.time() - start) < DEEPSPEED_TEST_TIMEOUT): + for p in processes: + if not p.is_alive(): + any_done = True + break + time.sleep(.1) # So we don't hog CPU + + # If we hit the timeout, then presume a test is hanged + if not any_done: + for p in processes: + p.terminate() + pytest.exit("Test hanged, exiting", returncode=0) + + # Wait for all other processes to complete + for p in processes: + p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT) + + failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0] + for rank, p in failed: + # If it still hasn't terminated, kill it because it hung. + if p.exitcode is None: + p.terminate() + pytest.fail(f'Worker {rank} hung.', pytrace=False) + if p.exitcode < 0: + pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', + pytrace=False) + if p.exitcode > 0: + pytest.fail(f'Worker {rank} exited with code {p.exitcode}', + pytrace=False) + + if not skip_msg.empty(): + # This assumed all skip messages are the same, it may be useful to + # add a check here to assert all exit messages are equal + pytest.skip(skip_msg.get()) + + def _dist_init(self, local_rank, num_procs, skip_msg): + """Initialize deepspeed.comm and execute the user function. 
""" + if self.set_dist_env: os.environ['MASTER_ADDR'] = '127.0.0.1' os.environ['MASTER_PORT'] = get_master_port() os.environ['LOCAL_RANK'] = str(local_rank) @@ -90,79 +185,179 @@ def distributed_test(world_size=2, backend='nccl'): os.environ['RANK'] = str(local_rank) os.environ['WORLD_SIZE'] = str(num_procs) - # turn off NCCL logging if set - os.environ.pop('NCCL_DEBUG', None) + # turn off NCCL logging if set + os.environ.pop('NCCL_DEBUG', None) - set_cuda_visibile() + if get_accelerator().is_available(): + set_accelerator_visible() - deepspeed.init_distributed(dist_backend=backend) + if self.init_distributed: + deepspeed.init_distributed(dist_backend=self.backend) + dist.barrier() - if torch.cuda.is_available(): - torch.cuda.set_device(local_rank) + if get_accelerator().is_available(): + get_accelerator().set_device(local_rank) - run_func(*func_args, **func_kwargs) + try: + self.run(**self._fixture_kwargs) + except BaseException as e: + if isinstance(e, Skipped): + skip_msg.put(e.msg) + else: + raise e + if self.init_distributed or dist.is_initialized(): # make sure all ranks finish at the same time - torch.distributed.barrier() - + dist.barrier() # tear down after test completes - torch.distributed.destroy_process_group() - - def dist_launcher(num_procs, *func_args, **func_kwargs): - """Launch processes and gracefully handle failures. """ - - # Spawn all workers on subprocesses. - processes = [] - for local_rank in range(num_procs): - p = Process(target=dist_init, - args=(local_rank, - num_procs, - *func_args), - kwargs=func_kwargs) - p.start() - processes.append(p) - - # Now loop and wait for a test to complete. The spin-wait here isn't a big - # deal because the number of processes will be O(#GPUs) << O(#CPUs). 
- any_done = False - while not any_done: - for p in processes: - if not p.is_alive(): - any_done = True - break - - # Wait for all other processes to complete - for p in processes: - p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT) - - failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0] - for rank, p in failed: - # If it still hasn't terminated, kill it because it hung. - if p.exitcode is None: - p.terminate() - pytest.fail(f'Worker {rank} hung.', pytrace=False) - if p.exitcode < 0: - pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', - pytrace=False) - if p.exitcode > 0: - pytest.fail(f'Worker {rank} exited with code {p.exitcode}', - pytrace=False) - - def run_func_decorator(*func_args, **func_kwargs): - """Entry point for @distributed_test(). """ - - if isinstance(world_size, int): - dist_launcher(world_size, *func_args, **func_kwargs) - elif isinstance(world_size, list): - for procs in world_size: - dist_launcher(procs, *func_args, **func_kwargs) - time.sleep(0.5) - else: - raise TypeError(f'world_size must be an integer or a list of integers.') + dist.destroy_process_group() + + +class DistributedFixture(DistributedExec): + """ + Implementation that extends @pytest.fixture to allow for distributed execution. + This is primarily meant to be used when a test requires executing two pieces of + code with different world sizes. + + There are 2 parameters that can be modified: + - world_size: int = 2 -- the number of processes to launch + - backend: Literal['nccl','mpi','gloo'] = 'nccl' -- which backend to use + + Features: + - able to call pytest.skip() inside fixture + - can be reused by multiple tests + - can accept other fixtures as input + + Limitations: + - cannot use @pytest.mark.parametrize + - world_size cannot be modified after definition and only one world_size value is accepted + - any fixtures used must also be used in the test that uses this fixture (see example below) + - return values cannot be returned. 
Passing values to a DistributedTest + object can be achieved using class_tmpdir and writing to file (see example below) + + Usage: + - must implement a run(self, ...) method + - fixture can be used by making the class name input to a test function + + Example: + @pytest.fixture(params=[10,20]) + def regular_pytest_fixture(request): + return request.param + + class distributed_fixture_example(DistributedFixture): + world_size = 4 + + def run(self, regular_pytest_fixture, class_tmpdir): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + local_rank = os.environ["LOCAL_RANK"] + print(f"Rank {local_rank} with value {regular_pytest_fixture}") + with open(os.path.join(class_tmpdir, f"{local_rank}.txt"), "w") as f: + f.write(f"{local_rank},{regular_pytest_fixture}") + + class TestExample(DistributedTest): + world_size = 1 + + def test(self, distributed_fixture_example, regular_pytest_fixture, class_tmpdir): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + for rank in range(4): + with open(os.path.join(class_tmpdir, f"{rank}.txt"), "r") as f: + assert f.read() == f"{rank},{regular_pytest_fixture}" + """ + is_dist_fixture = True + + # These values are just placeholders so that pytest recognizes this as a fixture + _pytestfixturefunction = FixtureFunctionMarker(scope="function", params=None) + __name__ = "" + + def __init__(self): + assert isinstance(self.world_size, int), "Only one world size is allowed for distributed fixtures" + self.__name__ = type(self).__name__ + _pytestfixturefunction = FixtureFunctionMarker(scope="function", + params=None, + name=self.__name__) - return run_func_decorator - return dist_wrap +class DistributedTest(DistributedExec): + """ + Implementation for running pytest with distributed execution. 
+ + There are 2 parameters that can be modified: + - world_size: Union[int,List[int]] = 2 -- the number of processes to launch + - backend: Literal['nccl','mpi','gloo'] = 'nccl' -- which backend to use + + Features: + - able to call pytest.skip() inside tests + - works with pytest fixtures, parametrize, mark, etc. + - can contain multiple tests (each of which can be parametrized separately) + - class methods can be fixtures (usable by tests in this class only) + - world_size can be changed for individual tests using @pytest.mark.world_size(world_size) + - class_tmpdir is a fixture that can be used to get a tmpdir shared among + all tests (including DistributedFixture) + + Usage: + - class name must start with "Test" + - must implement one or more test*(self, ...) methods + + Example: + @pytest.fixture(params=[10,20]) + def val1(request): + return request.param + + @pytest.mark.fast + @pytest.mark.parametrize("val2", [30,40]) + class TestExample(DistributedTest): + world_size = 2 + + @pytest.fixture(params=[50,60]) + def val3(self, request): + return request.param + + def test_1(self, val1, val2, str1="hello world"): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + assert all(val1, val2, str1) + + @pytest.mark.world_size(1) + @pytest.mark.parametrize("val4", [70,80]) + def test_2(self, val1, val2, val3, val4): + assert int(os.environ["WORLD_SIZE"]) == 1 + assert all(val1, val2, val3, val4) + """ + is_dist_test = True + + # Temporary directory that is shared among test methods in a class + @pytest.fixture(autouse=True, scope="class") + def class_tmpdir(self, tmpdir_factory): + fn = tmpdir_factory.mktemp(self.__class__.__name__) + return fn + + def run(self, **fixture_kwargs): + self._current_test(**fixture_kwargs) + + def __call__(self, request): + self._current_test = self._get_current_test_func(request) + self._fixture_kwargs = self._get_fixture_kwargs(request, self._current_test) + + if self.requires_cuda_env and not get_accelerator().is_available(): + 
pytest.skip("only supported in accelerator environments.") + + # Catch world_size override pytest mark + for mark in getattr(request.function, "pytestmark", []): + if mark.name == "world_size": + world_size = mark.args[0] + break + else: + world_size = self.world_size + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + self._launch_procs(procs) + time.sleep(0.5) + + def _get_current_test_func(self, request): + # DistributedTest subclasses may have multiple test methods + func_name = request.function.__name__ + return getattr(self, func_name) def get_test_path(filename): diff --git a/tests/unit/compression/test_compression.py b/tests/unit/compression/test_compression.py new file mode 100644 index 0000000000000000000000000000000000000000..7d1f02e771cbdc486803786e7b3f31b4e95ad6f9 --- /dev/null +++ b/tests/unit/compression/test_compression.py @@ -0,0 +1,268 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import pytest +import random +import numpy as np +from unit.megatron_model import get_gpt2_model +from deepspeed.compression.compress import init_compression +from unit.modeling import BertConfig +from unit.modelingpreln import BertEncoder as BertEncoderPreln +from deepspeed.compression.basic_layer import LinearLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress +from deepspeed.compression.helper import convert_conv1d_to_linear +from deepspeed.accelerator import get_accelerator +from unit.common import DistributedTest + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) +pytestmark = pytest.mark.skipif( + TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') + + +def reset_random(seed=1234): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + get_accelerator().manual_seed_all(seed) + + +def create_bert_model(): + hidden_size = 384 + num_layers = 
2 + heads = 12 + dropout_ratio = 0.1 + bert_config = BertConfig(vocab_size_or_config_json_file=119547, + hidden_size=hidden_size, + num_hidden_layers=num_layers, + num_attention_heads=heads, + intermediate_size=hidden_size * 4, + hidden_act="gelu", + hidden_dropout_prob=dropout_ratio, + attention_probs_dropout_prob=dropout_ratio, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.2) + + weights = [] + biases = [] + + for i in range(4): + weights.append(torch.nn.Parameter(torch.Tensor(hidden_size, hidden_size))) + + weights.append(torch.nn.Parameter(torch.Tensor(hidden_size))) + weights.append(torch.nn.Parameter(torch.Tensor(hidden_size * 4, hidden_size))) + weights.append(torch.nn.Parameter(torch.Tensor(hidden_size, hidden_size * 4))) + weights.append(torch.nn.Parameter(torch.Tensor(hidden_size))) + + biases.append(torch.nn.Parameter(torch.Tensor(hidden_size))) + for i in range(4): + biases.append(torch.nn.Parameter(torch.Tensor(hidden_size))) + biases.append(torch.nn.Parameter(torch.Tensor(hidden_size * 4))) + biases.append(torch.nn.Parameter(torch.Tensor(hidden_size))) + biases.append(torch.nn.Parameter(torch.Tensor(hidden_size))) + + return BertEncoderPreln(bert_config, weights, biases) + + +class Conv1D(torch.nn.Module): + """ + 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). + Basically works like a linear layer but the weights are transposed. + Args: + nf (`int`): The number of output features. + nx (`int`): The number of input features. 
+ """ + def __init__(self, nf, nx): + super().__init__() + self.nf = nf + w = torch.empty(nx, nf) + self.weight = torch.nn.Parameter(w) + self.bias = torch.nn.Parameter(torch.zeros(nf)) + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf, ) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(size_out) + return x + + +def create_conv1d_model(): + nf = 128 + nx = 128 + + return torch.nn.ModuleList([Conv1D(nf, nx) for i in range(4)]) + + +class TestCompression(DistributedTest): + def setup_method(self, method): + reset_random() + + def get_ds_config(self): + ds_config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + }, + "compression_training": { + "weight_quantization": { + "shared_parameters": { + "enabled": True, + "quantizer_kernel": False, + "schedule_offset": 50, + "quantize_groups": 1, + "quantize_verbose": False, + "quantization_type": "asymmetric", + "rounding": "nearest", + "fp16_mixed_quantize": { + "enabled": False, + "quantize_change_ratio": 0.001 + } + }, + "different_groups": { + "wq1": { + "params": { + "start_bits": 12, + "target_bits": 8, + "quantization_period": 50 + }, + "modules": ["attention.self", + "intermediate"] + }, + "wq2": { + "params": { + "start_bits": 12, + "target_bits": 4, + "quantization_period": 50 + }, + "modules": ["attention.output"] + } + } + }, + "activation_quantization": { + "shared_parameters": { + "enabled": True, + "quantization_type": "asymmetric", + "range_calibration": "dynamic", + "schedule_offset": 50 + }, + "different_groups": { + "aq1": { + "params": { + "bits": 8 + }, + "modules": ["attention.output"] + } + } + }, + "sparse_pruning": { + "shared_parameters": { + "enabled": True, + "schedule_offset": 30, + "method": "l1" + }, + "different_groups": { + "sp1": { + "params": { + "dense_ratio": 0.5 + }, + "modules": ["attention.self"] + } + } + }, + "row_pruning": { + 
"shared_parameters": { + "enabled": True, + "schedule_offset": 20, + "method": "topk" + }, + "different_groups": { + "rp1": { + "params": { + "dense_ratio": 0.5 + }, + "modules": ["intermediate.dense"], + "related_modules": [["layer.\\w+.output.dense"]] + } + } + }, + "head_pruning": { + "shared_parameters": { + "enabled": True, + "schedule_offset": 10, + "method": "topk", + "num_heads": 12 + }, + "different_groups": { + "rp1": { + "params": { + "dense_ratio": 0.5 + }, + "modules": ["attention.output.dense"], + "related_modules": [["self.query", + "self.key", + "self.value"]] + } + } + } + } + } + + return ds_config_dict + + def test_linear_layer_compress(self, tmpdir): + model = create_bert_model() + compressed_model = init_compression(model, self.get_ds_config()) + + assert isinstance(compressed_model.layer[0].attention.self.query, + LinearLayer_Compress) + assert isinstance(compressed_model.layer[0].attention.self.key, + LinearLayer_Compress) + assert isinstance(compressed_model.layer[0].attention.self.value, + LinearLayer_Compress) + + def test_mpu_compress(self, tmpdir): + #from megatron import mpu + import sys + sys.path.append(r"/home/aishsh/megatron-lm") + from megatron import mpu + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + model = get_gpt2_model(args_defaults) + compressed_model = init_compression(model, self.get_ds_config(), mpu=mpu) + + assert isinstance( + compressed_model.module.language_model.transformer.layers[0].attention. + query_key_value, + ColumnParallelLinear_Compress) + assert isinstance( + compressed_model.module.language_model.transformer.layers[0].attention.dense, + RowParallelLinear_Compress) + assert isinstance( + compressed_model.module.language_model.transformer.layers[0].mlp. + dense_h_to_4h, + ColumnParallelLinear_Compress) + assert isinstance( + compressed_model.module.language_model.transformer.layers[0].mlp. 
+ dense_4h_to_h, + RowParallelLinear_Compress) + + def test_conv1d_convertion(self, tmpdir): + model = create_conv1d_model() + compressed_model = convert_conv1d_to_linear(model, Conv1D) + + assert isinstance(compressed_model[0], torch.nn.Linear) + assert isinstance(compressed_model[1], torch.nn.Linear) + assert isinstance(compressed_model[2], torch.nn.Linear) + assert isinstance(compressed_model[3], torch.nn.Linear) diff --git a/tests/unit/elasticity/test_elastic.py b/tests/unit/elasticity/test_elastic.py new file mode 100644 index 0000000000000000000000000000000000000000..e29b2a22e825b900ccb86dcd818da781ac185b41 --- /dev/null +++ b/tests/unit/elasticity/test_elastic.py @@ -0,0 +1,292 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +import deepspeed +from unit.common import DistributedTest +from deepspeed.git_version_info import version as ds_version +import os +from unit.simple_model import SimpleModel + + +@pytest.fixture +def ds_config(): + config_dict = { + "elasticity": { + "enabled": True, + "max_train_batch_size": 10000, + "micro_batch_sizes": [8, + 12, + 16, + 17], + "min_gpus": 32, + "max_gpus": 1500, + "min_time": 20, + "version": 0.1 + } + } + return config_dict + + +def test_basic_10k(ds_config): + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version=ds_version) + + for gpu_num in valid_gpus: + assert final_batch_size % gpu_num == 0, f"Batch {final_batch_size} is not divisible by GPU count {gpu_num}" + batch_per_gpu = final_batch_size // gpu_num + found_valid_mb = False + + for mb in ds_config['elasticity']['micro_batch_sizes']: + if batch_per_gpu % mb == 0: + found_valid_mb = True + break + assert found_valid_mb, "No valid mb found" + + assert len(valid_gpus) == 23 + assert final_batch_size == 9792 + + +def test_old_version(ds_config): + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + final_batch_size, valid_gpus = 
deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version="0.2") + + +def test_disabled(ds_config): + ds_config['elasticity']['enabled'] = False + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_valid_world_size(ds_config): + final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version=ds_version, + world_size=64) + assert mbsize == 17 + + +def test_invalid_world_size(ds_config): + with pytest.raises(deepspeed.elasticity.config.ElasticityIncompatibleWorldSize): + final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version=ds_version, + world_size=128) + + +def test_future_elastic_version(ds_config): + ds_config['elasticity']['version'] = '0.3' + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_missing_max_batch(ds_config): + del ds_config['elasticity']['max_train_batch_size'] + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_missing_micro_batch(ds_config): + del ds_config['elasticity']['micro_batch_sizes'] + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_empty_config(): + ds_config = {"elasticity": {"enabled": True}} + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def 
test_model_parallel_v1_invalid(ds_config): + ds_config["elasticity"]["model_parallel_size"] = 4 + ds_config["elasticity"]["num_gpus_per_node"] = 8 + ds_config["elasticity"]["version"] = 0.1 + + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_model_parallel_v2_invalid(ds_config): + ds_config["elasticity"]["model_parallel_size"] = 16 + ds_config["elasticity"]["num_gpus_per_node"] = 8 + ds_config["elasticity"]["version"] = 0.2 + + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version, + world_size=16) + + +def test_model_parallel_v2_valid(ds_config): + ds_config["elasticity"]["model_parallel_size"] = 4 + ds_config["elasticity"]["num_gpus_per_node"] = 8 + ds_config["elasticity"]["version"] = 0.2 + + os.environ["WORLD_SIZE"] = str(16) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + os.environ.pop("WORLD_SIZE") + + +@pytest.mark.parametrize('key, value', + [('micro_batch_sizes', + [1, + 4, + -1, + 2, + -10]), + ('min_gpus', + -1), + ('max_gpus', + -1), + ('micro_batch_sizes', + 5), + ('micro_batch_sizes', + ['a', + None, + 0.5]), + ('micro_batch_sizes', + [2, + 0.5, + 4])]) +def test_invalid_config_values(key, value, ds_config): + ds_config['elasticity'][key] = value + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_proper_mbsz(ds_config): + ds_config["elasticity"]["max_train_batch_size"] = 32 + ds_config["elasticity"]["micro_batch_sizes"] = [1, 2, 3, 7] + ds_config["elasticity"]["min_gpus"] = 1 + final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + 
target_deepspeed_version=ds_version, + world_size=7) + assert mbsize == 3 + + +class TestNonElasticBatchParams(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "elasticity": { + "enabled": True, + "max_train_batch_size": 4, + "micro_batch_sizes": [1, + 2, + 3, + 4], + "min_gpus": 1, + "max_gpus": 4, + "min_time": 20, + "version": 0.1 + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + model, _, _,_ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + +class TestNonElasticBatchParamsWithOverride(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "elasticity": { + "enabled": True, + "max_train_batch_size": 4, + "micro_batch_sizes": [1, + 2, + 3, + 4], + "min_gpus": 1, + "max_gpus": 4, + "min_time": 20, + "version": 0.1, + "ignore_non_elastic_batch_info": True + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _,_ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + +class TestElasticConfigChanged(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "elasticity": { + "enabled": True, + "max_train_batch_size": 4, + "micro_batch_sizes": [1, + 2, + 3, + 4], + "min_gpus": 1, + "max_gpus": 4, + "min_time": 20, + "version": 0.1, + "ignore_non_elastic_batch_info": True + } + } + import json, os + scheduler_elastic_config = 
config_dict.copy() + scheduler_elastic_config["elasticity"]["max_train_batch_size"] = 27 + os.environ['DEEPSPEED_ELASTICITY_CONFIG'] = json.dumps(scheduler_elastic_config) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + model, _, _,_ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) diff --git a/tests/unit/launcher/test_ds_arguments.py b/tests/unit/launcher/test_ds_arguments.py new file mode 100644 index 0000000000000000000000000000000000000000..9d7af74f2c57706d547c478f7e1e4ea2e10dc6ee --- /dev/null +++ b/tests/unit/launcher/test_ds_arguments.py @@ -0,0 +1,102 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import argparse +import pytest +import deepspeed + + +def basic_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('--num_epochs', type=int) + return parser + + +def test_no_ds_arguments_no_ds_parser(): + parser = basic_parser() + args = parser.parse_args(['--num_epochs', '2']) + assert args + + assert hasattr(args, 'num_epochs') + assert args.num_epochs == 2 + + assert not hasattr(args, 'deepspeed') + assert not hasattr(args, 'deepspeed_config') + + +def test_no_ds_arguments(): + parser = basic_parser() + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args(['--num_epochs', '2']) + assert args + + assert hasattr(args, 'num_epochs') + assert args.num_epochs == 2 + + assert hasattr(args, 'deepspeed') + assert args.deepspeed == False + + assert hasattr(args, 'deepspeed_config') + assert args.deepspeed_config == None + + +def test_no_ds_enable_argument(): + parser = basic_parser() + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args(['--num_epochs', '2', '--deepspeed_config', 'foo.json']) + assert args + + assert hasattr(args, 'num_epochs') + assert args.num_epochs == 2 + + assert hasattr(args, 'deepspeed') + assert args.deepspeed == False + + assert 
hasattr(args, 'deepspeed_config') + assert type(args.deepspeed_config) == str + assert args.deepspeed_config == 'foo.json' + + +def test_no_ds_config_argument(): + parser = basic_parser() + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args(['--num_epochs', '2', '--deepspeed']) + assert args + + assert hasattr(args, 'num_epochs') + assert args.num_epochs == 2 + + assert hasattr(args, 'deepspeed') + assert type(args.deepspeed) == bool + assert args.deepspeed == True + + assert hasattr(args, 'deepspeed_config') + assert args.deepspeed_config == None + + +def test_no_ds_parser(): + parser = basic_parser() + with pytest.raises(SystemExit): + args = parser.parse_args(['--num_epochs', '2', '--deepspeed']) + + +def test_core_deepscale_arguments(): + parser = basic_parser() + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args( + ['--num_epochs', + '2', + '--deepspeed', + '--deepspeed_config', + 'foo.json']) + assert args + + assert hasattr(args, 'num_epochs') + assert args.num_epochs == 2 + + assert hasattr(args, 'deepspeed') + assert type(args.deepspeed) == bool + assert args.deepspeed == True + + assert hasattr(args, 'deepspeed_config') + assert type(args.deepspeed_config) == str + assert args.deepspeed_config == 'foo.json' diff --git a/tests/unit/launcher/test_multinode_runner.py b/tests/unit/launcher/test_multinode_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..27233d7150dcadf6720f8b93aa9afe140dfcedce --- /dev/null +++ b/tests/unit/launcher/test_multinode_runner.py @@ -0,0 +1,52 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from copy import deepcopy +from deepspeed.launcher import multinode_runner as mnrunner +from deepspeed.launcher.runner import encode_world_info, parse_args +import os +import pytest + + +@pytest.fixture +def runner_info(): + hosts = {'worker-0': 4, 'worker-1': 4} + world_info = encode_world_info(hosts) + env = deepcopy(os.environ) + args = 
parse_args(['test_launcher.py']) + return env, hosts, world_info, args + + +def test_pdsh_runner(runner_info): + env, resource_pool, world_info, args = runner_info + runner = mnrunner.PDSHRunner(args, world_info) + cmd, kill_cmd = runner.get_cmd(env, resource_pool) + assert cmd[0] == 'pdsh' + assert env['PDSH_RCMD_TYPE'] == 'ssh' + + +def test_openmpi_runner(runner_info): + env, resource_pool, world_info, args = runner_info + runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert cmd[0] == 'mpirun' + + +def test_mpich_runner(runner_info): + env, resource_pool, world_info, args = runner_info + runner = mnrunner.MPICHRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert cmd[0] == 'mpirun' + + +def test_slurm_runner(runner_info): + env, resource_pool, world_info, args = runner_info + runner = mnrunner.SlurmRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert cmd[0] == 'srun' + + +def test_mvapich_runner(runner_info): + env, resource_pool, world_info, args = runner_info + runner = mnrunner.MVAPICHRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert cmd[0] == 'mpirun' diff --git a/tests/unit/launcher/test_run.py b/tests/unit/launcher/test_run.py new file mode 100644 index 0000000000000000000000000000000000000000..1d7f4efc6815e83d513034ed86cd2e5ad349b15e --- /dev/null +++ b/tests/unit/launcher/test_run.py @@ -0,0 +1,177 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest + +from deepspeed.launcher import runner as dsrun + + +def test_parser_mutual_exclusive(): + '''Ensure dsrun.parse_resource_filter() raises a ValueError when include_str and + exclude_str are both provided. + ''' + with pytest.raises(ValueError): + dsrun.parse_resource_filter({}, include_str='A', exclude_str='B') + + +def test_parser_local(): + ''' Test cases with only one node. 
''' + # First try no include/exclude + hosts = {'worker-0': [0, 1, 2, 3]} + ret = dsrun.parse_resource_filter(hosts) + assert (ret == hosts) + + # exclude slots + ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0:1') + assert (ret == {'worker-0': [0, 2, 3]}) + + ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0:1,2') + assert (ret == {'worker-0': [0, 3]}) + + # only use one slot + ret = dsrun.parse_resource_filter(hosts, include_str='worker-0:1') + assert (ret == {'worker-0': [1]}) + + # including slots multiple times shouldn't break things + ret = dsrun.parse_resource_filter(hosts, include_str='worker-0:1,1') + assert (ret == {'worker-0': [1]}) + ret = dsrun.parse_resource_filter(hosts, include_str='worker-0:1@worker-0:0,1') + assert (ret == {'worker-0': [0, 1]}) + + # including just 'worker-0' without : should still use all GPUs + ret = dsrun.parse_resource_filter(hosts, include_str='worker-0') + assert (ret == hosts) + + # excluding just 'worker-0' without : should eliminate everything + ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0') + assert (ret == {}) + + # exclude all slots manually + ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0:0,1,2,3') + assert (ret == {}) + + +def test_parser_multinode(): + # First try no include/exclude + hosts = {'worker-0': [0, 1, 2, 3], 'worker-1': [0, 1, 2, 3]} + ret = dsrun.parse_resource_filter(hosts) + assert (ret == hosts) + + # include a node + ret = dsrun.parse_resource_filter(hosts, include_str='worker-1:0,3') + assert (ret == {'worker-1': [0, 3]}) + + # exclude a node + ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-1') + assert (ret == {'worker-0': [0, 1, 2, 3]}) + + # exclude part of each node + ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0:0,1@worker-1:3') + assert (ret == {'worker-0': [2, 3], 'worker-1': [0, 1, 2]}) + + +def test_parser_errors(): + '''Ensure we catch errors. 
''' + hosts = {'worker-0': [0, 1, 2, 3], 'worker-1': [0, 1, 2, 3]} + + # host does not exist + with pytest.raises(ValueError): + dsrun.parse_resource_filter(hosts, include_str='jeff') + with pytest.raises(ValueError): + dsrun.parse_resource_filter(hosts, exclude_str='jeff') + + # slot does not exist + with pytest.raises(ValueError): + dsrun.parse_resource_filter(hosts, include_str='worker-1:4') + with pytest.raises(ValueError): + dsrun.parse_resource_filter(hosts, exclude_str='worker-1:4') + + # formatting + with pytest.raises(ValueError): + dsrun.parse_resource_filter(hosts, exclude_str='worker-1@worker-0:1@5') + + +def test_num_plus_parser(): + ''' Ensure we catch errors relating to num_nodes/num_gpus + -i/-e being mutually exclusive''' + + # inclusion + with pytest.raises(ValueError): + dsrun.main(args="--num_nodes 1 -i localhost foo.py".split()) + with pytest.raises(ValueError): + dsrun.main(args="--num_nodes 1 --num_gpus 1 -i localhost foo.py".split()) + with pytest.raises(ValueError): + dsrun.main(args="--num_gpus 1 -i localhost foo.py".split()) + + # exclusion + with pytest.raises(ValueError): + dsrun.main(args="--num_nodes 1 -e localhost foo.py".split()) + with pytest.raises(ValueError): + dsrun.main(args="--num_nodes 1 --num_gpus 1 -e localhost foo.py".split()) + with pytest.raises(ValueError): + dsrun.main(args="--num_gpus 1 -e localhost foo.py".split()) + + +def test_hostfile_good(): + # good hostfile w. 
empty lines and comment + hostfile = """ + worker-1 slots=2 + worker-2 slots=2 + + localhost slots=1 + 123.23.12.10 slots=2 + + #worker-1 slots=3 + # this is a comment + + """ + r = dsrun._parse_hostfile(hostfile.splitlines()) + assert "worker-1" in r + assert "worker-2" in r + assert "localhost" in r + assert "123.23.12.10" in r + assert r["worker-1"] == 2 + assert r["worker-2"] == 2 + assert r["localhost"] == 1 + assert r["123.23.12.10"] == 2 + assert len(r) == 4 + + +def test_hostfiles_bad(): + # duplicate host + hostfile = """ + worker-1 slots=2 + worker-2 slots=1 + worker-1 slots=1 + """ + with pytest.raises(ValueError): + dsrun._parse_hostfile(hostfile.splitlines()) + + # incorrect whitespace + hostfile = """ + this is bad slots=1 + """ + with pytest.raises(ValueError): + dsrun._parse_hostfile(hostfile.splitlines()) + + # no whitespace + hostfile = """ + missingslots + """ + with pytest.raises(ValueError): + dsrun._parse_hostfile(hostfile.splitlines()) + + # empty + hostfile = """ + """ + with pytest.raises(ValueError): + dsrun._parse_hostfile(hostfile.splitlines()) + + # mix of good/bad + hostfile = """ + worker-1 slots=2 + this is bad slots=1 + worker-2 slots=4 + missingslots + + """ + with pytest.raises(ValueError): + dsrun._parse_hostfile(hostfile.splitlines()) diff --git a/tests/unit/megatron_model.py b/tests/unit/megatron_model.py index fd2ef69b72595d3ccd64caca6b4fc60980efcc27..32faf224494027dd1f58fbe2ec6273d072e26657 100644 --- a/tests/unit/megatron_model.py +++ b/tests/unit/megatron_model.py @@ -1,4 +1,5 @@ -from pathlib import Path +'''Copyright The Microsoft DeepSpeed Team''' + import torch import os import sys @@ -6,6 +7,7 @@ import math from .common import get_test_path from deepspeed.pipe import PipelineModule, LayerSpec +from deepspeed.accelerator import get_accelerator def get_megatron_version(): @@ -38,10 +40,10 @@ def get_gpt2_model(args_others, mp_size=1): initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True) model = 
GPT2Model(num_tokentypes=0, parallel_output=False) - model.cuda() + model.to(get_accelerator().device_name()) from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron import mpu - i = torch.cuda.current_device() + i = get_accelerator().current_device_name() model = torchDDP(model, device_ids=[i], output_device=i, @@ -77,8 +79,9 @@ class MockGPT2ModelPipe(PipelineModule): class ParallelTransformerLayerPipe(ParallelTransformerLayer): def forward(self, args): # hardcode attn mask for testing, PP requires the attn_mask to be stashed - attention_mask = torch.tensor([[True]], - device=torch.cuda.current_device()) + attention_mask = torch.tensor( + [[True]], + device=get_accelerator().current_device_name()) return super().forward(args, attention_mask) layers = [] diff --git a/tests/unit/model_parallelism/test_configurable_parallel_mp.py b/tests/unit/model_parallelism/test_configurable_parallel_mp.py new file mode 100644 index 0000000000000000000000000000000000000000..d17f45c0b526afc11042936beee5ce90021a550f --- /dev/null +++ b/tests/unit/model_parallelism/test_configurable_parallel_mp.py @@ -0,0 +1,188 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import torch +import deepspeed +import pytest +import random +import numpy as np +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator +from unit.common import DistributedTest, DistributedFixture +from unit.megatron_model import get_gpt2_model, get_megatron_version + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) +pytestmark = pytest.mark.skipif( + TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') + + +def get_deepspeed_model(model): + ds_config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + } + + from megatron import mpu + model, _, _,_ = 
deepspeed.initialize(model=model, + mpu=mpu, + model_parameters=model.parameters(), + config=ds_config_dict) + return model + + +class ConfigurableMP(DistributedTest): + @pytest.fixture(autouse=True) + def reset_random(self, seed=1234): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + get_accelerator().manual_seed_all(seed) + + @pytest.fixture + def inputs(self, bs=1, seq_len=20): + input_ids = torch.randint(low=0, high=1000, size=(bs, seq_len)) + position_ids = torch.randint(low=0, high=2, size=(bs, seq_len)) + attention_mask = torch.randint(low=0, + high=2, + size=(bs, + seq_len), + dtype=torch.bool) + return [input_ids, position_ids, attention_mask] + + +class TestConfigurableMP(ConfigurableMP): + @pytest.mark.world_size(1) + def test_gpt2_basic(self, tmpdir, inputs): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + model = get_gpt2_model(args_defaults) + model = get_deepspeed_model(model) + + model.eval() + device_name = get_accelerator().device_name() + baseline = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) + + tag = 'mp_1' + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) + dist.barrier() + model.load_checkpoint(tmpdir, + tag=tag, + load_optimizer_states=False, + load_lr_scheduler_states=False) + + test = model(inputs[0], inputs[1], inputs[2]) + assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + + @pytest.mark.world_size(2) + def test_gpt2_mp2_no_resize(self, tmpdir, inputs): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + model = get_gpt2_model(args_defaults, mp_size=2) + model = get_deepspeed_model(model) + + model.eval() + + device_name = 
get_accelerator().device_name() + baseline = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) + + tag = 'mp_2' + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) + dist.barrier() + model.load_checkpoint(tmpdir, + tag=tag, + load_optimizer_states=False, + load_lr_scheduler_states=False) + + device_name = get_accelerator().device_name() + test = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) + assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + + +# This fixture provides the baseline model with mp=2 to TestConfigurableMPResize +class baseline_mp2(DistributedFixture): + world_size = 2 + + def run(self, inputs, class_tmpdir): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + model = get_gpt2_model(args_defaults, mp_size=self.world_size) + model = get_deepspeed_model(model) + + model.eval() + + with torch.no_grad(): + device_name = get_accelerator().device_name() + baseline = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) + if dist.get_rank() == 0: + save_path = os.path.join(class_tmpdir, "output.pt") + torch.save(baseline.cpu(), save_path) + + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(class_tmpdir, client_state=state_dict) + + +class TestConfigurableResizeMP(ConfigurableMP): + world_size = [1, 4] + + def test(self, baseline_mp2, inputs, class_tmpdir): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + world_size = os.environ["WORLD_SIZE"] + model = get_gpt2_model(args_defaults, mp_size=world_size) + model = get_deepspeed_model(model) + + model.eval() 
+ + with torch.no_grad(): + model.load_checkpoint(class_tmpdir, + load_optimizer_states=False, + load_lr_scheduler_states=False) + device_name = get_accelerator().device_name() + test = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) + if dist.get_rank() == 0: + load_path = os.path.join(class_tmpdir, "output.pt") + baseline = torch.load(load_path) + test = test.cpu() + assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" diff --git a/tests/unit/model_parallelism/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py new file mode 100644 index 0000000000000000000000000000000000000000..af091d68c411a47cd915577b06f4dda5dd37b461 --- /dev/null +++ b/tests/unit/model_parallelism/test_configurable_parallel_pp.py @@ -0,0 +1,352 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import torch +import deepspeed +import pytest +import random +import numpy as np +import deepspeed.comm as dist +from unit.common import DistributedTest, DistributedFixture +from unit.megatron_model import get_megatron_version +from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe +from deepspeed.utils import RepeatingLoader +from deepspeed.accelerator import get_accelerator + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) +pytestmark = pytest.mark.skipif( + TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') + + +def get_deepspeed_model(model): + ds_config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + } + + model, _, _,_ = deepspeed.initialize(model=model, + model_parameters=model.parameters(), + config=ds_config_dict) + return model.to(get_accelerator().device_name()) + + +def get_topology(mp, pp, world_size): 
+ assert world_size % (pp * mp) == 0 + dp = world_size // (pp * mp) + + from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology + topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp) + + return topo + + +class ConfigurablePP(DistributedTest): + @pytest.fixture(autouse=True) + def reset_random(self, seed=1234): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + get_accelerator().manual_seed_all(seed) + + @pytest.fixture + def inputs(self, bs=1, seq_len=1, hidden_size=128): + hidden_states = torch.randn(bs, seq_len, hidden_size) + attention_mask = torch.randint(low=0, + high=2, + size=(bs, + seq_len), + dtype=torch.bool) + return (hidden_states, attention_mask) + + +class TestConfigurablePP(ConfigurablePP): + mp_size = 2 + pp_size = 2 + world_size = 4 # mp_size * pp_size + + def test_pp_basic(self, inputs, tmpdir): + # basic test case, mp_size=2, pp_size=2, verify ckpt saving/loading. + args_defaults = { + 'num_layers': 8, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + mp_size = self.mp_size + pp_size = self.pp_size + world_size = self.world_size + + topo = get_topology(mp_size, pp_size, world_size) + gpt2_pipe_model = GPT2ModelPipe(num_layers=8, + num_stages=pp_size, + mp_size=mp_size, + args_others=args_defaults, + topo=topo) + model = get_deepspeed_model(gpt2_pipe_model) + + tag = 'pp_basic' + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) + + if model.is_first_stage() or model.is_last_stage(): + loader = RepeatingLoader([(inputs[0], 0)]) + data_iter = iter(loader) + else: + data_iter = None + + baseline = model.eval_batch(data_iter=data_iter, + compute_loss=False, + reduce_output=None) + + dist.barrier() + model.load_checkpoint(tmpdir, + tag=tag, + load_optimizer_states=False, + load_lr_scheduler_states=False) + dist.barrier() + + test = 
model.eval_batch(data_iter=data_iter, + compute_loss=False, + reduce_output=None) + + if test is not None: + assert len(baseline) == len(test) + # Compare outputs of each microbatch + for mb in range(len(baseline)): + for b, t in zip(baseline[mb], test[mb]): + if b.is_floating_point(): # don't compare masks + assert torch.allclose(b, t, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + + +# Fixture for defining the checkpoint path since all tests in +# TestConfigurableResizePP will use the same tmpdir +@pytest.fixture +def checkpoint_tag(mp_size, pp_size, mp_resize, pp_resize): + return f"{mp_size}-{pp_size}-{mp_resize}-{pp_resize}" + + +# Base class for creating / saving model output for baseline models. This is +# not meant to be used directly as a fixture to any classes +class _baseline(DistributedFixture): + world_size = None + + def run(self, inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size): + assert int(os.environ["WORLD_SIZE"]) == (pp_size * mp_size), "world size does not match provided pp_size and mp_size" + args_defaults = { + 'num_layers': 8, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + topo = get_topology(mp_size, pp_size, mp_size * pp_size) + gpt2_pipe_model = GPT2ModelPipe(num_layers=8, + num_stages=pp_size, + mp_size=mp_size, + args_others=args_defaults, + topo=topo) + model = get_deepspeed_model(gpt2_pipe_model) + + with torch.no_grad(): + inputs = [x.to(get_accelerator().device_name()) for x in inputs] + if model.is_first_stage() or model.is_last_stage(): + loader = RepeatingLoader([(inputs[0], 0)]) + data_iter = iter(loader) + else: + data_iter = None + + baseline = model.eval_batch(data_iter=data_iter, + compute_loss=False, + reduce_output=None) + + if baseline is not None: + # baseline should be [[hidden, True]]] + assert len(baseline) == 1 + assert len(baseline[0]) == 1 + assert torch.is_tensor(baseline[0][0]) + save_path = os.path.join(class_tmpdir, 
f"output-{checkpoint_tag}.pt") + torch.save(baseline[0][0].cpu(), save_path) + + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(class_tmpdir, + tag=checkpoint_tag, + client_state=state_dict) + + +# This may look odd, but there is a limitation with DistributedFixture that +# doesn't allow us to reuse a fixture with different worldsizes. This could be +# implemented in conftest.py::pytest_fixture_setup and common.py::DistributedFixture +class baseline_ws1(_baseline): + world_size = 1 + + +class baseline_ws2(_baseline): + world_size = 2 + + +class baseline_ws4(_baseline): + world_size = 4 + + +class TestConfigurableResizePP(ConfigurablePP): + def _test(self, + inputs, + class_tmpdir, + checkpoint_tag, + mp_size, + pp_size, + mp_resize, + pp_resize): + args_defaults = { + 'num_layers': 8, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + topo = get_topology(mp_resize, pp_resize, mp_resize * pp_resize) + gpt2_pipe_model = GPT2ModelPipe(num_layers=8, + num_stages=pp_resize, + mp_size=mp_resize, + args_others=args_defaults, + topo=topo) + model = get_deepspeed_model(gpt2_pipe_model) + + with torch.no_grad(): + model.load_checkpoint(class_tmpdir, + tag=checkpoint_tag, + load_optimizer_states=False, + load_lr_scheduler_states=False) + inputs = [x.to(get_accelerator().device_name()) for x in inputs] + if model.is_first_stage() or model.is_last_stage(): + loader = RepeatingLoader([(inputs[0], 0)]) + data_iter = iter(loader) + else: + data_iter = None + + test = model.eval_batch(data_iter=data_iter, + compute_loss=False, + reduce_output=None) + + if test is not None: + # test should be [[hidden, True]]] + assert len(test) == 1 + assert len(test[0]) == 1 + assert torch.is_tensor(test[0][0]) + test = test[0][0].cpu() + load_path = os.path.join(class_tmpdir, f"output-{checkpoint_tag}.pt") + baseline = torch.load(load_path) + assert torch.allclose(baseline, test, atol=1e-03), f"Baseline 
output {baseline} is not equal to save-then-load output {test}" + + # These tests are divided by baseline model worldsize and test model worldsize + @pytest.mark.world_size(1) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 2, 1, 1)]) + def test_world_size_2to1(self, + inputs, + class_tmpdir, + checkpoint_tag, + baseline_ws2, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, + class_tmpdir, + checkpoint_tag, + mp_size, + pp_size, + mp_resize, + pp_resize) + + @pytest.mark.world_size(1) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(2, 2, 1, 1)]) + def test_world_size_4to1(self, + inputs, + class_tmpdir, + checkpoint_tag, + baseline_ws4, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, + class_tmpdir, + checkpoint_tag, + mp_size, + pp_size, + mp_resize, + pp_resize) + + @pytest.mark.world_size(2) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(2, 2, 2, 1)]) + def test_world_size_4to2(self, + inputs, + class_tmpdir, + checkpoint_tag, + baseline_ws4, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, + class_tmpdir, + checkpoint_tag, + mp_size, + pp_size, + mp_resize, + pp_resize) + + @pytest.mark.world_size(4) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 1, 2, 2)]) + def test_world_size_1to4(self, + inputs, + class_tmpdir, + checkpoint_tag, + baseline_ws1, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, + class_tmpdir, + checkpoint_tag, + mp_size, + pp_size, + mp_resize, + pp_resize) + + @pytest.mark.world_size(4) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", + [(1, + 2, + 1, + 4), + (2, + 1, + 2, + 2)]) + def test_world_size_2to4(self, + inputs, + class_tmpdir, + checkpoint_tag, + baseline_ws2, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, + class_tmpdir, + checkpoint_tag, + mp_size, + pp_size, + mp_resize, + pp_resize) diff --git 
a/tests/unit/modeling.py b/tests/unit/modeling.py index 8bf2d6dba9da197bd9c306bf2f04c5c936adfbf3..50846b53265638eadacb22dbed2564c422a3b89e 100644 --- a/tests/unit/modeling.py +++ b/tests/unit/modeling.py @@ -1,3 +1,5 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +# Copyright The Microsoft DeepSpeed Team # DeepSpeed note, code taken from commit 3d59216cec89a363649b4fe3d15295ba936ced0f # https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py @@ -18,8 +20,6 @@ # limitations under the License. """PyTorch BERT model.""" -from __future__ import absolute_import, division, print_function, unicode_literals - import copy import json import logging @@ -28,24 +28,22 @@ import os import shutil import tarfile import tempfile -import sys from io import open import torch from torch import nn from torch.nn import CrossEntropyLoss from torch.utils import checkpoint -import torch.distributed as dist +import deepspeed.comm as dist from torch.nn import Module -from torch.nn.parameter import Parameter import torch.nn.functional as F import torch.nn.init as init -import time #from numba import cuda #from deepspeed_cuda import DeepSpeedSoftmaxConfig, DeepSpeedSoftmax +from deepspeed.accelerator import get_accelerator logger = logging.getLogger(__name__) @@ -187,8 +185,8 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} class GPUTimer: def __init__(self): super().__init__() - self.start = cuda.event() - self.stop = cuda.event() + self.start = get_accelerator().Event() # noqa: F821 + self.stop = get_accelerator().Event() # noqa: F821 def record(self): self.start.record() @@ -216,9 +214,7 @@ class LinearActivation(Module): self.out_features = out_features self.fused_gelu = False self.fused_tanh = False - if isinstance(act, - str) or (sys.version_info[0] == 2 and isinstance(act, - unicode)): + if isinstance(act, str): if bias and act == 'gelu': self.fused_gelu = True elif bias and 
act == 'tanh': @@ -307,10 +303,7 @@ class BertConfig(object): initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. """ - if isinstance(vocab_size_or_config_json_file, - str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, - unicode)): + if isinstance(vocab_size_or_config_json_file, str): with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: json_config = json.loads(reader.read()) for key, value in json_config.items(): @@ -367,6 +360,9 @@ try: import apex.normalization #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') BertLayerNorm = apex.normalization.FusedLayerNorm +#aiss debug + #from torch.nn.modules import LayerNorm as BertLayerNorm + except ImportError: print( "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex." @@ -644,8 +640,8 @@ class BertEncoder(nn.Module): def get_modules(self, big_node, input): for mdl in big_node.named_children(): - graph.append(mdl) - get_modules(self, mdl, input) + self.graph.append(mdl) + self.get_modules(self, mdl, input) def forward(self, hidden_states, @@ -757,12 +753,12 @@ class BertLMPredictionHead(nn.Module): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - torch.cuda.nvtx.range_push( + get_accelerator().range_push( "decoder input.size() = {}, weight.size() = {}".format( hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias - torch.cuda.nvtx.range_pop() + get_accelerator().range_pop() return hidden_states @@ -864,22 +860,22 @@ class BertPreTrainedModel(nn.Module): archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] else: archive_file = pretrained_model_name_or_path - if resolved_archive_file == archive_file: + if resolved_archive_file == archive_file: # noqa: F821 logger.info("loading archive file {}".format(archive_file)) else: logger.info("loading 
archive file {} from cache at {}".format( archive_file, - resolved_archive_file)) + resolved_archive_file)) # noqa: F821 tempdir = None - if os.path.isdir(resolved_archive_file) or from_tf: - serialization_dir = resolved_archive_file + if os.path.isdir(resolved_archive_file) or from_tf: # noqa: F821 + serialization_dir = resolved_archive_file # noqa: F821 else: # Extract archive to temp dir tempdir = tempfile.mkdtemp() logger.info("extracting archive file {} to temp dir {}".format( - resolved_archive_file, + resolved_archive_file, # noqa: F821 tempdir)) - with tarfile.open(resolved_archive_file, 'r:gz') as archive: + with tarfile.open(resolved_archive_file, 'r:gz') as archive: # noqa: F821 archive.extractall(tempdir) serialization_dir = tempdir # Load config @@ -892,7 +888,7 @@ class BertPreTrainedModel(nn.Module): weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load( weights_path, - map_location='cpu' if not torch.cuda.is_available() else None) + map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index 7661303a414595aeb5fc92581e75ff1d5349a84d..0069add9aa4d8aa115ae244cd28896ee7b9cee5e 100644 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -1,3 +1,5 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +# Copyright The Microsoft DeepSpeed Team # DeepSpeed note, code taken from commit 3d59216cec89a363649b4fe3d15295ba936ced0f # https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py @@ -18,8 +20,6 @@ # limitations under the License. 
"""PyTorch BERT model.""" -from __future__ import absolute_import, division, print_function, unicode_literals - import copy import json import logging @@ -28,20 +28,18 @@ import os import shutil import tarfile import tempfile -import sys from io import open import torch from torch import nn from torch.nn import CrossEntropyLoss from torch.utils import checkpoint -import torch.distributed as dist +import deepspeed.comm as dist from torch.nn import Module -from torch.nn.parameter import Parameter import torch.nn.functional as F import torch.nn.init as init -import time +from deepspeed.accelerator import get_accelerator #from numba import cuda @@ -187,8 +185,8 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} class GPUTimer: def __init__(self): super().__init__() - self.start = cuda.event() - self.stop = cuda.event() + self.start = get_accelerator().Event() # noqa: F821 + self.stop = get_accelerator().Event() # noqa: F821 def record(self): self.start.record() @@ -216,9 +214,7 @@ class LinearActivation(Module): self.out_features = out_features self.fused_gelu = False self.fused_tanh = False - if isinstance(act, - str) or (sys.version_info[0] == 2 and isinstance(act, - unicode)): + if isinstance(act, str): if bias and act == 'gelu': self.fused_gelu = True elif bias and act == 'tanh': @@ -307,10 +303,7 @@ class BertConfig(object): initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. 
""" - if isinstance(vocab_size_or_config_json_file, - str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, - unicode)): + if isinstance(vocab_size_or_config_json_file, str): with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: json_config = json.loads(reader.read()) for key, value in json_config.items(): @@ -738,8 +731,8 @@ class BertEncoder(nn.Module): def get_modules(self, big_node, input): for mdl in big_node.named_children(): - graph.append(mdl) - get_modules(self, mdl, input) + self.graph.append(mdl) + self.get_modules(self, mdl, input) def forward(self, hidden_states, @@ -852,12 +845,12 @@ class BertLMPredictionHead(nn.Module): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - torch.cuda.nvtx.range_push( + get_accelerator().range_push( "decoder input.size() = {}, weight.size() = {}".format( hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias - torch.cuda.nvtx.range_pop() + get_accelerator().range_pop() return hidden_states @@ -959,22 +952,22 @@ class BertPreTrainedModel(nn.Module): archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] else: archive_file = pretrained_model_name_or_path - if resolved_archive_file == archive_file: + if resolved_archive_file == archive_file: # noqa: F821 logger.info("loading archive file {}".format(archive_file)) else: logger.info("loading archive file {} from cache at {}".format( archive_file, - resolved_archive_file)) + resolved_archive_file)) # noqa: F821 tempdir = None - if os.path.isdir(resolved_archive_file) or from_tf: - serialization_dir = resolved_archive_file + if os.path.isdir(resolved_archive_file) or from_tf: # noqa: F821 + serialization_dir = resolved_archive_file # noqa: F821 else: # Extract archive to temp dir tempdir = tempfile.mkdtemp() logger.info("extracting archive file {} to temp dir {}".format( - resolved_archive_file, + resolved_archive_file, # 
noqa: F821 tempdir)) - with tarfile.open(resolved_archive_file, 'r:gz') as archive: + with tarfile.open(resolved_archive_file, 'r:gz') as archive: # noqa: F821 archive.extractall(tempdir) serialization_dir = tempdir # Load config @@ -987,7 +980,7 @@ class BertPreTrainedModel(nn.Module): weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load( weights_path, - map_location='cpu' if not torch.cuda.is_available() else None) + map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) diff --git a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..fe5359249dc802fe014ccc10fa5d24fee8007fd1 --- /dev/null +++ b/tests/unit/moe/test_moe.py @@ -0,0 +1,84 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed +import pytest +from unit.common import DistributedTest +from unit.simple_model import SimplePRMoEModel, SimpleMoEModel, sequence_dataloader +from unit.util import required_torch_version + + +@pytest.mark.parametrize("ep_size", [2, 4]) +@pytest.mark.parametrize("use_residual", [True, False]) +class TestMoE(DistributedTest): + world_size = 4 + + def test(self, ep_size, use_residual): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 8, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 16 + + # E+D -- ep_size = 2 + # E only -- ep_size = 4 + model = SimpleMoEModel(hidden_dim, ep_size=ep_size, use_residual=use_residual) + optimizer = torch.optim.AdamW(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer, + dist_init_required=False) + #dist_init_required=False -- parameterize to True/False? 
+ + data_loader = sequence_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize("ep_size, use_residual", [(2, True), (2, False)]) +class TestPRMoE(DistributedTest): + world_size = 4 + + def test(self, ep_size, use_residual): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 8, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 16 + + # E+D -- ep_size = 2 + # E only -- ep_size = 4 + model = SimplePRMoEModel(hidden_dim, ep_size=ep_size, use_residual=use_residual) + optimizer = torch.optim.AdamW(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer, + dist_init_required=False) + + data_loader = sequence_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() diff --git a/tests/unit/moe/test_moe_tp.py b/tests/unit/moe/test_moe_tp.py new file mode 100644 index 0000000000000000000000000000000000000000..ba63a102a0edf6a67e2c085bf0035c13e2f00da5 --- /dev/null +++ b/tests/unit/moe/test_moe_tp.py @@ -0,0 +1,98 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed +import pytest +from unit.common import DistributedTest +from unit.util import required_torch_version +from deepspeed.moe.layer import MoE + + +class MPU(): + def __init__(self, tp_world_size): + self.rank = deepspeed.comm.get_rank() + self.world_size = deepspeed.comm.get_world_size() + self.tp_world_size = tp_world_size + + for i in range(0, self.world_size, tp_world_size): + ranks = range(i, i + tp_world_size) + group = deepspeed.comm.new_group(ranks) + if 
self.rank in ranks: + self.tp_group = group + + for i in range(0, tp_world_size): + ranks = range(i, self.world_size, tp_world_size) + group = deepspeed.comm.new_group(ranks) + if self.rank in ranks: + self.dp_group = group + + def get_model_parallel_rank(self): + return self.rank % self.tp_world_size + + def get_model_parallel_world_size(self): + return self.tp_world_size + + def get_data_parallel_rank(self): + return self.rank // self.tp_world_size + + def get_data_parallel_world_size(self): + return self.world_size // self.tp_world_size + + def get_data_parallel_group(self): + return self.dp_group + + def get_model_parallel_group(self): + return self.tp_group + + +@pytest.mark.parametrize("ep_size, tp_size", [(1, 2), (1, 4), (2, 2)]) +@pytest.mark.parametrize("enable_expert_tp", [True, False]) +@pytest.mark.parametrize("use_residual", [True, False]) +class TestMOETensorParallel(DistributedTest): + world_size = 4 + + def test(self, ep_size, tp_size, enable_expert_tp, use_residual): + # TODO: replace this with a true parallel mlp in the future + # and run convergence tests + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 8, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 16 + + tensor_parallel_expert = torch.nn.Sequential( + torch.nn.Linear(hidden_dim, + 4 * hidden_dim // tp_size), + torch.nn.ReLU(), + torch.nn.Linear(4 * hidden_dim // tp_size, + hidden_dim)) + + # set num experts to world size + world_size = deepspeed.comm.get_world_size() + model = MoE( + hidden_size=hidden_dim, + expert=tensor_parallel_expert, + num_experts=world_size, + ep_size=ep_size, + use_residual=use_residual, + enable_expert_tensor_parallelism=enable_expert_tp, + ) + optimizer = torch.optim.AdamW(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer, + dist_init_required=False, + 
mpu=MPU(tp_size)) + + assert model.num_local_experts == world_size // ep_size + if enable_expert_tp: + assert deepspeed.utils.groups._get_expert_model_parallel_world_size( + ) == tp_size + else: + assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == 1 diff --git a/tests/unit/monitor/test_monitor.py b/tests/unit/monitor/test_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..7cf10619661c972e359f067bdc34ecc9cec56e58 --- /dev/null +++ b/tests/unit/monitor/test_monitor.py @@ -0,0 +1,96 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from deepspeed.monitor.tensorboard import TensorBoardMonitor +from deepspeed.monitor.wandb import WandbMonitor +from deepspeed.monitor.csv_monitor import csvMonitor +from deepspeed.monitor.config import DeepSpeedMonitorConfig + +from unit.common import DistributedTest +from deepspeed.runtime.config import DeepSpeedConfig + + +class TestTensorBoard(DistributedTest): + world_size = 2 + + def test_tensorboard(self): + config_dict = { + "train_batch_size": 2, + "tensorboard": { + "enabled": True, + "output_path": "test_output/ds_logs/", + "job_name": "test" + } + } + ds_config = DeepSpeedConfig(config_dict) + tb_monitor = TensorBoardMonitor(ds_config.monitor_config.tensorboard) + assert tb_monitor.enabled == True + assert tb_monitor.output_path == "test_output/ds_logs/" + assert tb_monitor.job_name == "test" + + def test_empty_tensorboard(self): + config_dict = {"train_batch_size": 2, "tensorboard": {}} + ds_config = DeepSpeedConfig(config_dict) + tb_monitor = TensorBoardMonitor(ds_config.monitor_config.tensorboard) + defaults = DeepSpeedMonitorConfig().tensorboard + assert tb_monitor.enabled == defaults.enabled + assert tb_monitor.output_path == defaults.output_path + assert tb_monitor.job_name == defaults.job_name + + +class TestWandB(DistributedTest): + world_size = 2 + + def test_wandb(self): + config_dict = { + "train_batch_size": 2, + "wandb": { + "enabled": False, + "group": 
"my_group", + "team": "my_team", + "project": "my_project" + } + } + ds_config = DeepSpeedConfig(config_dict) + wandb_monitor = WandbMonitor(ds_config.monitor_config.wandb) + assert wandb_monitor.enabled == False + assert wandb_monitor.group == "my_group" + assert wandb_monitor.team == "my_team" + assert wandb_monitor.project == "my_project" + + def test_empty_wandb(self): + config_dict = {"train_batch_size": 2, "wandb": {}} + ds_config = DeepSpeedConfig(config_dict) + wandb_monitor = WandbMonitor(ds_config.monitor_config.wandb) + defaults = DeepSpeedMonitorConfig().wandb + assert wandb_monitor.enabled == defaults.enabled + assert wandb_monitor.group == defaults.group + assert wandb_monitor.team == defaults.team + assert wandb_monitor.project == defaults.project + + +class TestCSVMonitor(DistributedTest): + world_size = 2 + + def test_csv_monitor(self): + config_dict = { + "train_batch_size": 2, + "csv_monitor": { + "enabled": True, + "output_path": "test_output/ds_logs/", + "job_name": "test" + } + } + ds_config = DeepSpeedConfig(config_dict) + csv_monitor = csvMonitor(ds_config.monitor_config.csv_monitor) + assert csv_monitor.enabled == True + assert csv_monitor.output_path == "test_output/ds_logs/" + assert csv_monitor.job_name == "test" + + def test_empty_csv_monitor(self): + config_dict = {"train_batch_size": 2, "csv_monitor": {}} + ds_config = DeepSpeedConfig(config_dict) + csv_monitor = csvMonitor(ds_config.monitor_config.csv_monitor) + defaults = DeepSpeedMonitorConfig().csv_monitor + assert csv_monitor.enabled == defaults.enabled + assert csv_monitor.output_path == defaults.output_path + assert csv_monitor.job_name == defaults.job_name diff --git a/tests/unit/multi_output_model.py b/tests/unit/multi_output_model.py index 7caf6f7de75fda2900daf8c689877ece5231f309..8993813aa5453533cc4cbd05e165d8c47f536008 100644 --- a/tests/unit/multi_output_model.py +++ b/tests/unit/multi_output_model.py @@ -1,6 +1,5 @@ -import os -import json -import argparse +'''Copyright 
The Microsoft DeepSpeed Team''' + import torch diff --git a/tests/unit/ops/accelerators/test_accelerator_backward.py b/tests/unit/ops/accelerators/test_accelerator_backward.py new file mode 100644 index 0000000000000000000000000000000000000000..ad26daeb698c40bb22a7688d9ff2bb3f52c14ca8 --- /dev/null +++ b/tests/unit/ops/accelerators/test_accelerator_backward.py @@ -0,0 +1,345 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import math +import numpy as np +import torch +import pytest +import random +import copy +from torch import nn +from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from deepspeed.accelerator import get_accelerator +from unit.modeling import BertConfig, BertLayerNorm, BertEncoder as BertEncoderPostln +from unit.modelingpreln import BertEncoder as BertEncoderPreln +from unit.common import DistributedTest + +#if not deepspeed.ops.__installed_ops__['transformer']: +#pytest.skip( +# "transformer kernels are temporarily disabled because of unexplained failures", +# allow_module_level=True) + + +def check_equal(first, second, atol=1e-2, verbose=False): + diction_x = {} + diction_y = {} + + if verbose: + for i, (x, y) in enumerate(zip(first, second)): + print(x[1], y[1]) + + for i, (x, y) in enumerate(zip(first, second)): + k = 0 + while (diction_x.get((k, x[1])) is not None): + k = k + 1 + diction_x[k, x[1]] = x[0] + k = 0 + while (diction_y.get((k, y[1])) is not None): + k = k + 1 + diction_y[k, y[1]] = y[0] + if verbose: + print() + for i, (x, y) in enumerate(zip(diction_x, diction_y)): + print(x, y) + + for i, (x, y) in enumerate(zip(diction_x, diction_y)): + if (x[0] == 1): continue + if verbose: + print("checking ", x[1], ":") + y = diction_y[x[0], x[1]] + x = diction_x[x[0], x[1]] + + if verbose: + print(((x == float('inf')).nonzero(as_tuple=True)[0])) + print(((y == float('inf')).nonzero(as_tuple=True)[0])) + x = x.cpu().detach().numpy() + y = y.cpu().detach().numpy() + + avgx = np.sum(abs(x), dtype=float) + countx = 
x.shape[0] + for i in range(len(x.shape) - 1): + countx *= x.shape[i + 1] + avgx = np.sum(avgx) + tolerance = 1 + if avgx != float('inf') and avgx != -float('inf'): + avgx = avgx / countx + tolerance = avgx * atol + if verbose: + print("tolerance is ", tolerance) + x = x.flatten() + y = y.flatten() + print("x = {}".format(x)) + print("y = {}".format(y)) + if any(x == float('inf')) or any(x == -float('inf')): + print("found infinity in x") + if any(y == float('inf')) or any(y == -float('inf')): + print("found infinity in y") + print(np.linalg.norm(x.astype('float64'))) + print(np.linalg.norm(y.astype('float64'))) + print('-' * 80) + #toler = np.linalg.norm(x.astype('float64')) * 0.0005 + np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=tolerance) + + +def zero_grad(variables): + for variable in variables: + variable.grad.zero_() + + +device = torch.device(get_accelerator().device_name()) +kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} +kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} + + +class DSEncoder(nn.Module): + def __init__(self, config, weights, biases): + super(DSEncoder, self).__init__() + self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.layer = nn.ModuleList([ + copy.deepcopy(DeepSpeedTransformerLayer(config, + weights, + biases)) + for _ in range(config.num_hidden_layers) + ]) + self.grads = [] + self.pre_or_post = config.pre_layer_norm + + def forward(self, + hidden_states, + attention_mask, + output_all_encoded_layers=True, + checkpoint_activations=False): + all_encoder_layers = [] + + def custom(start, end): + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1]) + return x_ + + return custom_forward + + if checkpoint_activations: + l = 0 + num_layers = len(self.layer) + chunk_length = math.ceil(math.sqrt(num_layers)) + while l < num_layers: + hidden_states = 
device = torch.device(get_accelerator().device_name())
kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True}
kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True}


class DSEncoder(nn.Module):
    """Stack of DeepSpeedTransformerLayer modules used as the kernel-side
    counterpart of the reference BertEncoder in the backward tests; records
    gradients via hooks so they can be compared layer by layer."""

    def __init__(self, config, weights, biases):
        super(DSEncoder, self).__init__()
        self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        # Deep-copy each layer so parameters are not shared across layers.
        self.layer = nn.ModuleList([
            copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases))
            for _ in range(config.num_hidden_layers)
        ])
        self.grads = []
        self.pre_or_post = config.pre_layer_norm

    def forward(self,
                hidden_states,
                attention_mask,
                output_all_encoded_layers=True,
                checkpoint_activations=False):
        """Run the layer stack, recording gradients into self.grads.

        Returns all encoded layers when output_all_encoded_layers is True,
        otherwise a one-element list with the final hidden states.
        """
        all_encoder_layers = []

        def custom(start, end):
            # Closure over layers [start, end) for activation checkpointing.
            def custom_forward(*inputs):
                x_ = inputs[0]
                for layer in self.layer[start:end]:
                    x_ = layer(x_, inputs[1])
                return x_

            return custom_forward

        if checkpoint_activations:
            # Fix: `checkpoint` was previously an undefined name (the
            # original carried `# noqa: F821`), making this branch raise
            # NameError when checkpoint_activations=True.
            from torch.utils import checkpoint
            l = 0
            num_layers = len(self.layer)
            chunk_length = math.ceil(math.sqrt(num_layers))
            while l < num_layers:
                hidden_states = checkpoint.checkpoint(custom(l, l + chunk_length),
                                                      hidden_states,
                                                      attention_mask * 1)
                l += chunk_length
        else:
            for layer_module in self.layer:
                hidden_states = layer_module(hidden_states,
                                             attention_mask,
                                             grads=self.grads)
                # Capture the gradient flowing into each layer output.
                hidden_states.register_hook(
                    lambda x, self=self: self.grads.append([x, "hidden_state"]))

                if output_all_encoded_layers:
                    all_encoder_layers.append(hidden_states)

        if not output_all_encoded_layers or checkpoint_activations:
            if self.pre_or_post:
                # Pre-LayerNorm models normalize the final output.
                hidden_states = self.FinalLayerNorm(hidden_states)
            all_encoder_layers.append(hidden_states)
        return all_encoder_layers

    def get_grads(self):
        """Return the list of [tensor, name] gradients recorded so far."""
        return self.grads


def create_models(ds_config):
    """Build a reference BERT encoder and a DeepSpeed encoder sharing the
    same randomly initialized parameters; both are moved to the current
    accelerator and cast to fp16 when ds_config.fp16 is set."""
    bert_config = BertConfig(vocab_size_or_config_json_file=119547,
                             hidden_size=ds_config.hidden_size,
                             num_hidden_layers=ds_config.num_hidden_layers,
                             num_attention_heads=ds_config.heads,
                             intermediate_size=ds_config.intermediate_size,
                             hidden_act="gelu",
                             hidden_dropout_prob=ds_config.hidden_dropout_ratio,
                             attention_probs_dropout_prob=ds_config.attn_dropout_ratio,
                             max_position_embeddings=512,
                             type_vocab_size=2,
                             initializer_range=ds_config.initializer_range)

    hidden = ds_config.hidden_size
    intermediate = ds_config.intermediate_size
    std = ds_config.initializer_range

    def normal_param(*shape):
        # Normal-initialized parameter; creation order matters because each
        # normal_() call consumes RNG state (order matches the original).
        p = nn.Parameter(torch.Tensor(*shape))
        p.data.normal_(mean=0.0, std=std)
        return p

    def const_param(size, value):
        p = nn.Parameter(torch.Tensor(size))
        p.data.fill_(value)
        return p

    # Same creation order/shapes as the original hand-unrolled version.
    weights = [normal_param(hidden, hidden) for _ in range(4)]
    weights.append(const_param(hidden, 1.0))
    weights.append(normal_param(intermediate, hidden))
    weights.append(normal_param(hidden, intermediate))
    weights.append(const_param(hidden, 1.0))

    # All biases start at zero; sizes mirror the weight layout above.
    biases = [const_param(size, 0.0)
              for size in [hidden] * 5 + [intermediate, hidden, hidden]]

    if ds_config.pre_layer_norm:
        bert_encoder = BertEncoderPreln(bert_config, weights, biases)
    else:
        bert_encoder = BertEncoderPostln(bert_config, weights, biases)
    ds_encoder = DSEncoder(ds_config, weights, biases)

    if ds_config.fp16:
        bert_encoder.half()
        ds_encoder.half()

    bert_encoder.to(get_accelerator().device_name())
    ds_encoder.to(get_accelerator().device_name())

    return bert_encoder, ds_encoder


def set_seed(seed):
    """Seed python, numpy and torch RNGs for reproducible test data."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


def run_backward(ds_config, seq_len, atol=1e-2, verbose=False):
    """Backward-pass parity check: run one forward+backward through both
    encoders on identical random data and compare recorded gradients."""
    set_seed(123)
    bert_encoder, ds_encoder = create_models(ds_config)

    # Inputs, mask and regression target in the configured precision.
    tensor_kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32
    hidden_states = torch.randn(ds_config.batch_size,
                                seq_len,
                                ds_config.hidden_size,
                                **tensor_kwargs)
    input_mask = torch.randn(ds_config.batch_size, 1, 1, seq_len, **tensor_kwargs)
    target = torch.randn(ds_config.batch_size,
                         seq_len,
                         ds_config.hidden_size,
                         **tensor_kwargs)

    def backprop(encoder):
        # Forward, scaled MSE loss, backward; return recorded gradients.
        outputs = encoder(hidden_states,
                          input_mask,
                          output_all_encoded_layers=False,
                          checkpoint_activations=False)
        loss = (target - outputs[0]).pow(2).sum() / 64
        loss.backward()
        return encoder.get_grads()

    base_grads = backprop(bert_encoder)   # baseline first, as before
    ds_grads = backprop(ds_encoder)

    check_equal(base_grads, ds_grads, atol=atol, verbose=verbose)


@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol',
                         [
                             (64,160,128,2,24,False,True, 0.2),
                             (64,1600,128,2,4,False,True, 0.2),
                             (8,1600,128,25,3,True,True, 0.05),
                             (8,160,128,2,3,True,True, 0.1),
                             (8,1600,128,2,3,True,True, 0.05),
                         ]) # yapf: disable
class TestCUDABackward(DistributedTest):
    world_size = 1

    def test_backward(self,
                      batch_size,
                      hidden_size,
                      seq_len,
                      heads,
                      num_layers,
                      is_preln,
                      use_fp16,
                      atol):
        """Gradient parity between the reference and DeepSpeed encoders."""
        # Only run fp16 test cases on devices with FP16 capability.
        if not get_accelerator().is_fp16_supported() and (use_fp16 is True
                                                          or is_preln is False):
            return

        ds_config = DeepSpeedTransformerConfig()
        ds_config.layer_id = None
        ds_config.batch_size = batch_size
        ds_config.hidden_size = hidden_size
        # NOTE: the backward tests intentionally use intermediate == hidden.
        ds_config.intermediate_size = hidden_size
        ds_config.heads = heads
        ds_config.attn_dropout_ratio = 0.0
        ds_config.hidden_dropout_ratio = 0.0
        ds_config.num_hidden_layers = num_layers
        ds_config.pre_layer_norm = is_preln
        ds_config.initializer_range = 0.02
        ds_config.fp16 = use_fp16

        run_backward(ds_config, seq_len, atol=atol, verbose=True)
+ # if not get_accelerator().is_fp16_supported() and use_fp16 is True: + # return + # + # ds_config = DeepSpeedTransformerConfig() + # ds_config.layer_id = None + # ds_config.batch_size = batch_size + # ds_config.hidden_size = hidden_size + # ds_config.intermediate_size = 4 * hidden_size + # ds_config.max_seq_length = seq_len + # ds_config.heads = heads + # ds_config.attn_dropout_ratio = 0.0 + # ds_config.hidden_dropout_ratio = 0.0 + # ds_config.num_hidden_layers = num_layers + # ds_config.pre_layer_norm = is_preln + # ds_config.initializer_range = 0.02 + # ds_config.fp16 = use_fp16 + # ds_config.stochastic_mode = True + # + # run_backward(ds_config, atol=atol) diff --git a/tests/unit/ops/accelerators/test_accelerator_forward.py b/tests/unit/ops/accelerators/test_accelerator_forward.py new file mode 100644 index 0000000000000000000000000000000000000000..83ff70b9bcf1b885a487cc3e98f2f25a0ea3fc92 --- /dev/null +++ b/tests/unit/ops/accelerators/test_accelerator_forward.py @@ -0,0 +1,341 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import math +import numpy as np +import torch +import pytest +import random +import copy +from torch import nn +from unit.modelingpreln import BertEncoder as BertEncoderPreln +from unit.modeling import BertLayerNorm, BertConfig, BertEncoder as BertEncoderPostln +from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from deepspeed.accelerator import get_accelerator +from unit.common import DistributedTest + + +def check_equal(first, second, atol=1e-2, verbose=False): + if verbose: + print() + for i, (x, y) in enumerate(zip(first, second)): + x = x[0].cpu().detach().numpy() + y = y[0].cpu().detach().numpy() + if verbose: + print("x = {}".format(x.flatten())) + print("y = {}".format(y.flatten())) + print('-' * 80) + np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=atol) + + +def zero_grad(variables): + for variable in variables: + variable.grad.zero_() + + +device = 
device = torch.device(get_accelerator().device_name())
kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True}
kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True}


class DSEncoder(nn.Module):
    """Stack of DeepSpeedTransformerLayer modules mirroring the reference
    BertEncoder, used by the forward parity tests."""

    def __init__(self, config, weights, biases):
        super(DSEncoder, self).__init__()
        self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        # Deep-copy each layer so parameters are not shared across layers.
        self.layer = nn.ModuleList([
            copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases))
            for _ in range(config.num_hidden_layers)
        ])
        self.grads = []
        self.pre_or_post = config.pre_layer_norm

    def forward(self,
                hidden_states,
                attention_mask,
                output_all_encoded_layers=True,
                checkpoint_activations=False):
        """Run the layer stack; optionally checkpoint activations.

        Returns all encoded layers when output_all_encoded_layers is True,
        otherwise a one-element list with the final hidden states.
        """
        all_encoder_layers = []

        def custom(start, end):
            # Closure over layers [start, end) for activation checkpointing.
            def custom_forward(*inputs):
                x_ = inputs[0]
                for layer in self.layer[start:end]:
                    x_ = layer(x_, inputs[1])
                return x_

            return custom_forward

        if checkpoint_activations:
            # Fix: `checkpoint` was previously an undefined name (the
            # original carried `# noqa: F821`), making this branch raise
            # NameError when checkpoint_activations=True.
            from torch.utils import checkpoint
            l = 0
            num_layers = len(self.layer)
            chunk_length = math.ceil(math.sqrt(num_layers))
            while l < num_layers:
                hidden_states = checkpoint.checkpoint(custom(l, l + chunk_length),
                                                      hidden_states,
                                                      attention_mask * 1)
                l += chunk_length
        else:
            for layer_module in self.layer:
                hidden_states = layer_module(hidden_states, attention_mask)

                if output_all_encoded_layers:
                    all_encoder_layers.append(hidden_states)

        if not output_all_encoded_layers or checkpoint_activations:
            if self.pre_or_post:
                # Pre-LayerNorm models normalize the final output.
                hidden_states = self.FinalLayerNorm(hidden_states)
            all_encoder_layers.append(hidden_states)
        return all_encoder_layers


def create_models(ds_config):
    """Build a reference BERT encoder and a DeepSpeed encoder sharing the
    same randomly initialized parameters; both are moved to the current
    accelerator and cast to fp16 when ds_config.fp16 is set."""
    bert_config = BertConfig(vocab_size_or_config_json_file=119547,
                             hidden_size=ds_config.hidden_size,
                             num_hidden_layers=ds_config.num_hidden_layers,
                             num_attention_heads=ds_config.heads,
                             batch_size=ds_config.batch_size,
                             intermediate_size=ds_config.intermediate_size,
                             hidden_act="gelu",
                             hidden_dropout_prob=ds_config.hidden_dropout_ratio,
                             attention_probs_dropout_prob=ds_config.attn_dropout_ratio,
                             max_position_embeddings=512,
                             type_vocab_size=2,
                             initializer_range=ds_config.initializer_range,
                             fp16=ds_config.fp16)

    hidden = ds_config.hidden_size
    intermediate = ds_config.intermediate_size
    std = ds_config.initializer_range

    def normal_param(*shape):
        # Normal-initialized parameter; creation order matters because each
        # normal_() call consumes RNG state (order matches the original).
        p = nn.Parameter(torch.Tensor(*shape))
        p.data.normal_(mean=0.0, std=std)
        return p

    def const_param(size, value):
        p = nn.Parameter(torch.Tensor(size))
        p.data.fill_(value)
        return p

    # Same creation order/shapes as the original hand-unrolled version.
    weights = [normal_param(hidden, hidden) for _ in range(4)]
    weights.append(const_param(hidden, 1.0))
    weights.append(normal_param(intermediate, hidden))
    weights.append(normal_param(hidden, intermediate))
    weights.append(const_param(hidden, 1.0))

    # All biases start at zero; sizes mirror the weight layout above.
    biases = [const_param(size, 0.0)
              for size in [hidden] * 5 + [intermediate, hidden, hidden]]

    if ds_config.pre_layer_norm:
        bert_encoder = BertEncoderPreln(bert_config, weights, biases)
    else:
        bert_encoder = BertEncoderPostln(bert_config, weights, biases)
    ds_encoder = DSEncoder(ds_config, weights, biases)

    if ds_config.fp16:
        bert_encoder.half()
        ds_encoder.half()

    bert_encoder.to(get_accelerator().device_name())
    ds_encoder.to(get_accelerator().device_name())

    return bert_encoder, ds_encoder


def set_seed(seed):
    """Seed python, numpy and torch RNGs for reproducible test data."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
torch.manual_seed(seed) + + +def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): + set_seed(123) + bert_encoder, ds_encoder = create_models(ds_config) + print("bert_model:11111111111") + print(bert_encoder) + print("ds_model:2222222222222") + print(ds_encoder) + bsz = ds_config.batch_size if test_bsz is None else test_bsz + #prepare test data + kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 + hidden_states = torch.randn(bsz, seq_len, ds_config.hidden_size, **kwargs) + input_mask = torch.randn(bsz, 1, 1, seq_len, **kwargs) + + # run baseline + base_results = bert_encoder(hidden_states, + input_mask, + output_all_encoded_layers=False, + checkpoint_activations=False) + + # run ds + ds_results = ds_encoder(hidden_states, + input_mask, + output_all_encoded_layers=False, + checkpoint_activations=False) + + # check forward evaluation + check_equal(base_results, ds_results, atol=atol, verbose=verbose) + + +# FP16 test cases can only run on the devices support FP16. 
def _build_transformer_config(batch_size, hidden_size, heads, num_layers, is_preln, use_fp16):
    """Build the DeepSpeedTransformerConfig shared by all forward tests.

    Factors out the config-construction code that was duplicated verbatim
    across the three forward test classes. intermediate_size is fixed at
    4 * hidden_size and dropout is disabled for determinism.
    """
    ds_config = DeepSpeedTransformerConfig()
    ds_config.layer_id = None
    ds_config.batch_size = batch_size
    ds_config.hidden_size = hidden_size
    ds_config.intermediate_size = 4 * hidden_size
    ds_config.heads = heads
    ds_config.attn_dropout_ratio = 0.0
    ds_config.hidden_dropout_ratio = 0.0
    ds_config.num_hidden_layers = num_layers
    ds_config.pre_layer_norm = is_preln
    ds_config.initializer_range = 0.02
    ds_config.fp16 = use_fp16
    return ds_config


# FP16 test cases can only run on the devices support FP16.
@pytest.mark.sequential
@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16',
                         [
                             (64,160,128,2,24,False,True),
                             #(8,2048,2048,32,1,True,True),
                             (8,160,128,2,3,True,True),
                             (8,160,128,2,3,False,True),
                             (8,1600,128,2,3,True,True),
                             (8,1600,128,25,3,True,True),
                             (8,1600,128,25,3,False,True),
                             (8,256,52,4,3,True,True),
                             (3,1024,51,16,3,True,False),
                             (3,1024,54,16,3,True,True),
                             (8,1024,381,16,3,True,False),
                             (8,1024,384,16,3,True,True),
                             (8,1024,384,16,3,True,True),
                             (8,1024,119,16,3,True,False),
                             (8,1024,120,16,3,True,True),
                             (8,1024,509,16,3,True,False),
                             (8,1024,512,16,3,True,True),
                             (64,1024,56,16,3,False,False),
                             (64,1024,53,16,3,False,True),
                             (64,1024,24,16,3,False,False),
                             (64,1024,21,16,3,False,True),
                             (8,1024,384,16,3,False,False),
                             (8,1024,384,16,3,False,True),
                             (8,1024,512,16,3,False,False),
                             (8,1024,511,16,3,False,True),
                             (8,1536,128,24,3,False,False),
                             (8,1536,128,24,3,False,True),
                             (8,2048,128,32,3,False,False),
                             (8,2048,128,32,3,False,True),
                             (8,2560,128,40,3,False,False),
                             (8,2560,128,40,3,False,True),
                             (8,128,128,2,3,True,False),
                             (8,128,128,2,3,True,True),
                             (8,4096,128,64,3,True,True),
                             (8,8192,128,64,3,False,True),
                             (1,256,2048,32,3,True,True),
                         ]) # yapf: disable
class TestCUDAForward(DistributedTest):
    world_size = 1

    def test_forward(self,
                     batch_size,
                     hidden_size,
                     seq_len,
                     heads,
                     num_layers,
                     is_preln,
                     use_fp16):
        """Forward-output parity for a wide sweep of model shapes."""
        # Only run fp16 test cases on devices with FP16 capability.
        if not get_accelerator().is_fp16_supported() and use_fp16 is True:
            return

        ds_config = _build_transformer_config(batch_size,
                                              hidden_size,
                                              heads,
                                              num_layers,
                                              is_preln,
                                              use_fp16)
        run_forward(ds_config, seq_len, atol=3e-2)


@pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16',
                         [
                             (8,3,1024,512,16,3,True,False),
                             (8,7,1024,512,16,3,True,True),
                             (8,3,1024,512,16,3,False,False),
                             (8,7,1024,512,16,3,False,True),
                         ]) # yapf: disable
class TestCUDAForwardSmallBatchSize(DistributedTest):
    world_size = 1

    def test_forward_with_small_bsz(self,
                                    batch_size,
                                    small_bsz,
                                    hidden_size,
                                    seq_len,
                                    heads,
                                    num_layers,
                                    is_preln,
                                    use_fp16):
        """Forward parity when running with a batch smaller than configured."""
        # Only run fp16 test cases on devices with FP16 capability.
        if not get_accelerator().is_fp16_supported() and use_fp16 is True:
            return

        ds_config = _build_transformer_config(batch_size,
                                              hidden_size,
                                              heads,
                                              num_layers,
                                              is_preln,
                                              use_fp16)
        run_forward(ds_config, seq_len, atol=3e-2, test_bsz=small_bsz)


@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16',
                         [
                             #(64,1024,128,16,3,True,False),
                             #(64,1024,128,16,3,True,True),
                             #(64,1024,128,16,3,False,False),
                             #(64,1024,128,16,3,False,True),
                         ]) # yapf: disable
class TestCUDAForwardStochastic(DistributedTest):
    world_size = 1

    def test_forward_stochastic(self,
                                batch_size,
                                hidden_size,
                                seq_len,
                                heads,
                                num_layers,
                                is_preln,
                                use_fp16):
        """Forward parity with stochastic mode enabled (looser tolerance).
        All parameter rows are currently commented out, so this is inert."""
        # Only run fp16 test cases on devices with FP16 capability.
        if not get_accelerator().is_fp16_supported() and use_fp16 is True:
            return

        ds_config = _build_transformer_config(batch_size,
                                              hidden_size,
                                              heads,
                                              num_layers,
                                              is_preln,
                                              use_fp16)
        ds_config.stochastic_mode = True
        run_forward(ds_config, seq_len, atol=7e-2)
def check_equal(first, second, atol=1e-2, verbose=False):
    """Assert two CPU tensors agree elementwise within *atol*."""
    x = first.detach().numpy()
    y = second.detach().numpy()
    if verbose:
        print("x = {}".format(x.flatten()))
        print("y = {}".format(y.flatten()))
        print('-' * 80)
    np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol)


class TestCPUAdagrad(DistributedTest):
    world_size = 1
    requires_cuda_env = False
    if not get_accelerator().is_available():
        init_distributed = False
        set_dist_env = False

    @pytest.mark.parametrize('model_size',
                             [
                                 (64),
                                 (22),
                                 (55),
                                 (127),
                                 (1024),
                                 (1048576),
                                 (30000000),
                             ]) # yapf: disable
    def test_cpu_adagrad_opt(self, model_size):
        """DeepSpeedCPUAdagrad must match torch.optim.Adagrad after 10 steps
        on identical parameters and gradients."""
        device = 'cpu'
        # Restore RNG state between randn calls so both params start equal.
        rng_state = torch.get_rng_state()
        param = torch.nn.Parameter(torch.randn(model_size, device=device))
        torch.set_rng_state(rng_state)
        param1 = torch.nn.Parameter(torch.randn(model_size, device=device))
        torch.set_rng_state(rng_state)

        optimizer = DeepSpeedCPUAdagrad([param])
        optimizer1 = torch.optim.Adagrad([param1])

        for _ in range(10):
            # Same trick: replay the RNG so both grads are identical.
            rng_state = torch.get_rng_state()
            param.grad = torch.randn(model_size, device=device)
            torch.set_rng_state(rng_state)
            param1.grad = torch.randn(model_size, device=device)
            optimizer.step()
            optimizer1.step()

        check_equal(param, param1, atol=1e-2, verbose=True)

    @pytest.mark.parametrize('model_size,vocabulary_size,dim',
                             [
                                 (16 * 2, 16 * 4, 16),
                                 (16 * 32, 16 * 256, 16),
                                 (16 * 256, 16 * 16384, 16),
                             ]) # yapf: disable
    def test_cpu_adagrad_opt_sparse_embedding(self, model_size, vocabulary_size, dim):
        """Sparse-gradient parity between DeepSpeedCPUAdagrad and
        torch.optim.Adagrad on a flattened embedding-shaped parameter."""
        device = 'cpu'
        rng_state = torch.get_rng_state()

        def gen_sparse_grad(vocabulary_size, dim, num_indices, dtype, device):
            # Random sparse rows of an (vocab, dim) embedding, re-expressed
            # as a 1-D sparse tensor over the flattened (vocab * dim,) param.
            i = torch.randint(vocabulary_size,
                              size=(1,
                                    num_indices),
                              dtype=torch.int64,
                              device=device)
            v = torch.randn(num_indices, dim, dtype=dtype, device=device)
            t = torch.sparse_coo_tensor(i, v, (vocabulary_size, dim), device=device)
            t = t.coalesce()
            # Map each (row, col) to the flat index row * dim + col.
            new_i = (t.indices().view(-1,
                                      1).repeat(1,
                                                dim) * dim +
                     torch.tensor(range(dim))).flatten().unsqueeze(0)
            new_v = t.values().flatten()
            new_t = torch.sparse_coo_tensor(new_i,
                                            new_v,
                                            (vocabulary_size * dim,
                                             ),
                                            device=device)
            new_t = new_t.coalesce()
            new_t.requires_grad = False
            return new_t

        # (removed dead aliases `voc_size = vocabulary_size` / `dim = dim`)
        num_indices = int(model_size // dim)
        dtype = torch.float32

        param = torch.nn.Parameter(torch.randn((vocabulary_size * dim,
                                                ),
                                               dtype=dtype,
                                               device=device),
                                   requires_grad=True)
        torch.set_rng_state(rng_state)
        param1 = torch.nn.Parameter(torch.randn((vocabulary_size * dim,
                                                 ),
                                                dtype=dtype,
                                                device=device),
                                    requires_grad=True)
        torch.set_rng_state(rng_state)

        optimizer = DeepSpeedCPUAdagrad([param])
        optimizer1 = torch.optim.Adagrad([param1])

        for _ in range(10):
            # Replay RNG so both optimizers see identical sparse gradients.
            torch.set_rng_state(rng_state)
            param.grad = gen_sparse_grad(vocabulary_size,
                                         dim,
                                         num_indices,
                                         dtype=dtype,
                                         device=device)
            torch.set_rng_state(rng_state)
            param1.grad = gen_sparse_grad(vocabulary_size,
                                          dim,
                                          num_indices,
                                          dtype=dtype,
                                          device=device)
            optimizer.step()
            optimizer1.step()

        check_equal(param, param1, atol=1e-2, verbose=True)


class TestCPUAdagradGPUError(DistributedTest):
    def test_cpu_adagrad_gpu_error(self):
        """DeepSpeedCPUAdagrad must assert when given device-resident params."""
        model_size = 64
        device = get_accelerator().device_name(0)  # 'cuda:0' or 'xpu:0'
        param = torch.nn.Parameter(torch.randn(model_size, device=device))
        optimizer = DeepSpeedCPUAdagrad([param])

        param.grad = torch.randn(model_size, device=device)
        with pytest.raises(AssertionError):
            optimizer.step()
(torch.optim.AdamW, None)], + ["Adam", False, False, False, (FusedAdam, False)], + ["Adam", False, True, False, (torch.optim.Adam, None)], + ["Adam", True, False, False, (DeepSpeedCPUAdam, False)], + ["Adam", True, True, False, (torch.optim.Adam, None)], + ["Adam", False, False, True, (FusedAdam, True)], + ["Adam", False, True, True, (torch.optim.AdamW, None)], + ["Adam", True, False, True, (DeepSpeedCPUAdam, True)], + ["Adam", True, True, True, (torch.optim.AdamW, None)]] + +@pytest.mark.parametrize( + 'optimizer, zero_offload, torch_adam, adam_w_mode, resulting_optimizer', + adam_configs) +class TestAdamConfigs(DistributedTest): + world_size = 1 + + def test(self, + optimizer, + zero_offload, + torch_adam, + adam_w_mode, + resulting_optimizer): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": optimizer, + "params": { + "lr": 0.00015, + "torch_adam": torch_adam, + "adam_w_mode": adam_w_mode + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": 2, + "cpu_offload": zero_offload + } + } + model = SimpleModel(10) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + # get base optimizer under zero + ds_optimizer = model.optimizer.optimizer + opt_class, adam_w_mode = resulting_optimizer + assert isinstance(ds_optimizer, opt_class) + if adam_w_mode in [True, False]: + assert ds_optimizer.adam_w_mode == adam_w_mode diff --git a/tests/unit/ops/adam/test_cpu_adam.py b/tests/unit/ops/adam/test_cpu_adam.py new file mode 100644 index 0000000000000000000000000000000000000000..d10fb98105a807c599e4f037946be587f052ebdd --- /dev/null +++ b/tests/unit/ops/adam/test_cpu_adam.py @@ -0,0 +1,129 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import numpy as np +import pytest +from cpuinfo import get_cpu_info + +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.adam import 
FusedAdam +from deepspeed.ops.op_builder import CPUAdamBuilder +from unit.common import DistributedTest + +if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible", allow_module_level=True) + +pytest.cpu_vendor = get_cpu_info()["vendor_id_raw"].lower() + + +def check_equal(first, second, atol=1e-2, verbose=False): + x = first.detach().numpy() + y = second.detach().numpy() + print("ATOL", atol) + if verbose: + print("x = {}".format(x.flatten())) + print("y = {}".format(y.flatten())) + print('-' * 80) + np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol) + + +def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): + for i in range(10): + param1.grad = torch.randn(model_size, device=param1.device).to(param1.dtype) + param2.grad = param1.grad.clone().detach().to(device=param2.device, + dtype=param2.dtype) + + optimizer1.step() + optimizer2.step() + + tolerance = param1.float().norm().detach().numpy() * 1e-2 + check_equal(param1.float().norm(), + param2.float().cpu().norm(), + atol=tolerance, + verbose=True) + + +@pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) +@pytest.mark.parametrize('model_size', + [ + (64), + (22), + #(55), + (128), + (1024), + (1048576), + ]) # yapf: disable +class TestCPUAdam(DistributedTest): + world_size = 1 + requires_cuda_env = False + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + @pytest.mark.skipif(not get_accelerator().is_available(), + reason="only supported in CUDA environments.") + def test_fused_adam_equal(self, dtype, model_size): + if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): + pytest.skip("cpu-adam with half precision not supported on AMD CPUs") + + from deepspeed.ops.adam import DeepSpeedCPUAdam + + cpu_data = torch.randn(model_size, device='cpu').to(dtype) + cpu_param = torch.nn.Parameter(cpu_data) + cuda_param = 
torch.nn.Parameter(cpu_data.to(get_accelerator().device_name())) + + # tolerance = cpu_param.float().norm().detach().numpy() * 1e-2 + # check_equal(cpu_param.float().norm(), + # cuda_param.float().cpu().norm(), + # atol=tolerance, + # verbose=True) + + cpu_optimizer = DeepSpeedCPUAdam([cpu_param]) + cuda_optimizer = FusedAdam([cuda_param]) + + _compare_optimizers(model_size=model_size, + param1=cpu_param, + optimizer1=cpu_optimizer, + param2=cuda_param, + optimizer2=cuda_optimizer) + + def test_torch_adamw_equal(self, dtype, model_size): + if get_accelerator().is_available(): + if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): + pytest.skip("cpu-adam with half precision not supported on AMD CPUs") + ref_param_device = get_accelerator().device_name() + else: + if dtype == torch.half: + pytest.skip( + "torch.optim.AdamW with half precision only supported in CUDA environments." + ) + ref_param_device = 'cpu' + + from deepspeed.ops.adam import DeepSpeedCPUAdam + + cpu_data = torch.randn(model_size, device='cpu').to(dtype) + cpu_param = torch.nn.Parameter(cpu_data) + ref_param = torch.nn.Parameter(cpu_data.to(ref_param_device)) + + cpu_optimizer = DeepSpeedCPUAdam([cpu_param]) + ref_optimizer = torch.optim.AdamW([ref_param]) + + _compare_optimizers(model_size=model_size, + param1=cpu_param, + optimizer1=cpu_optimizer, + param2=ref_param, + optimizer2=ref_optimizer) + + +class TestCPUAdamGPUError(DistributedTest): + def test_cpu_adam_gpu_error(self): + model_size = 64 + from deepspeed.ops.adam import DeepSpeedCPUAdam + device = get_accelerator().device_name(0) # 'cuda:0' or 'xpu:0' + param = torch.nn.Parameter(torch.randn(model_size, device=device)) + optimizer = DeepSpeedCPUAdam([param]) + + param.grad = torch.randn(model_size, device=device) + with pytest.raises(AssertionError): + optimizer.step() diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py new file mode 100644 index 
KILO_BYTE = 1024
BLOCK_SIZE = KILO_BYTE
QUEUE_DEPTH = 2
IO_SIZE = 4 * BLOCK_SIZE
IO_PARALLEL = 2

if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]:
    pytest.skip('Skip tests since async-io is not compatible', allow_module_level=True)


def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True):
    """Skip the current test when it needs CUDA features that are absent."""
    if not get_accelerator().is_available():
        if use_cuda_device:
            pytest.skip("GPU tensors only supported in CUDA environments.")
        if use_cuda_pinned_tensor:
            pytest.skip("CUDA-pinned tensors only supported in CUDA environments.")


def _get_local_rank():
    """Distributed rank of this process, or 0 without an accelerator."""
    if get_accelerator().is_available():
        return dist.get_rank()
    return 0


def _do_ref_write(tmpdir, index=0):
    """Write IO_SIZE random bytes to a per-rank reference file.

    Returns:
        (path, bytes) — the file written and the exact payload.
    """
    suffix = f'{_get_local_rank()}_{index}'
    ref_file = os.path.join(tmpdir, f'_py_random_{suffix}.pt')
    ref_buffer = os.urandom(IO_SIZE)
    with open(ref_file, 'wb') as f:
        f.write(ref_buffer)

    return ref_file, ref_buffer


def _get_test_write_file(tmpdir, index):
    """Per-rank destination path for an aio write test."""
    suffix = f'{_get_local_rank()}_{index}'
    return os.path.join(tmpdir, f'_aio_write_random_{suffix}.pt')


def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0):
    """Destination path plus a device-resident copy of *ref_buffer*."""
    test_file = _get_test_write_file(tmpdir, index)
    test_buffer = get_accelerator().ByteTensor(list(ref_buffer))
    return test_file, test_buffer


def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0):
    """Destination path plus a pinned CPU copy of *ref_buffer*.

    Uses accelerator pinning when no aio handle is supplied; otherwise
    allocates via the handle's locked-tensor API and copies the data in.
    """
    test_file = _get_test_write_file(tmpdir, index)
    if aio_handle is None:
        test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer)))
    else:
        staging = torch.ByteTensor(list(ref_buffer))
        test_buffer = aio_handle.new_cpu_locked_tensor(len(ref_buffer), staging)
        test_buffer.data.copy_(staging)

    return test_file, test_buffer


def _validate_handle_state(handle, single_submit, overlap_events):
    """Assert an aio handle reports the configuration it was created with."""
    assert handle.get_single_submit() == single_submit
    assert handle.get_overlap_events() == overlap_events
    assert handle.get_thread_count() == IO_PARALLEL
    assert handle.get_block_size() == BLOCK_SIZE
    assert handle.get_queue_depth() == QUEUE_DEPTH
use_cuda_pinned_tensor, + single_submit, + overlap_events, + cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, + use_cuda_pinned_tensor=use_cuda_pinned_tensor) + + use_cpu_locked_tensor = False + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, + QUEUE_DEPTH, + single_submit, + overlap_events, + IO_PARALLEL) + + if cuda_device: + aio_buffer = torch.empty(IO_SIZE, + dtype=torch.uint8, + device=get_accelerator().device_name()) + elif use_cuda_pinned_tensor: + aio_buffer = get_accelerator().pin_memory( + torch.empty(IO_SIZE, + dtype=torch.uint8, + device='cpu')) + else: + aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, + torch.empty(0, + dtype=torch.uint8)) + use_cpu_locked_tensor = True + + _validate_handle_state(h, single_submit, overlap_events) + + ref_file, _ = _do_ref_write(tmpdir) + read_status = h.async_pread(aio_buffer, ref_file) + assert read_status == 0 + + wait_status = h.wait() + assert wait_status == 1 + + with open(ref_file, 'rb') as f: + ref_buffer = list(f.read()) + assert ref_buffer == aio_buffer.tolist() + + if use_cpu_locked_tensor: + h.free_cpu_locked_tensor(aio_buffer) + + +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) +@pytest.mark.parametrize("single_submit", [True, False]) +@pytest.mark.parametrize("overlap_events", [True, False]) +class TestWrite(DistributedTest): + world_size = 1 + requires_cuda_env = False + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + def test_parallel_write(self, + tmpdir, + use_cuda_pinned_tensor, + single_submit, + overlap_events): + _skip_for_invalid_environment(use_cuda_device=False, + use_cuda_pinned_tensor=use_cuda_pinned_tensor) + + ref_file, ref_buffer = _do_ref_write(tmpdir) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, + QUEUE_DEPTH, + single_submit, + overlap_events, + IO_PARALLEL) + + if use_cuda_pinned_tensor: + aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) + else: + aio_file, 
aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, h) + + _validate_handle_state(h, single_submit, overlap_events) + + write_status = h.sync_pwrite(aio_buffer, aio_file) + assert write_status == 1 + + if not use_cuda_pinned_tensor: + h.free_cpu_locked_tensor(aio_buffer) + + assert os.path.isfile(aio_file) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, aio_file, shallow=False) + + @pytest.mark.parametrize("cuda_device", [True, False]) + def test_async_write(self, + tmpdir, + use_cuda_pinned_tensor, + single_submit, + overlap_events, + cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, + use_cuda_pinned_tensor=use_cuda_pinned_tensor) + + ref_file, ref_buffer = _do_ref_write(tmpdir) + + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, + QUEUE_DEPTH, + single_submit, + overlap_events, + IO_PARALLEL) + use_cpu_locked_tensor = False + if cuda_device: + aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer) + elif use_cuda_pinned_tensor: + aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) + else: + aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, h) + use_cpu_locked_tensor = True + + _validate_handle_state(h, single_submit, overlap_events) + + write_status = h.async_pwrite(aio_buffer, aio_file) + assert write_status == 0 + + wait_status = h.wait() + assert wait_status == 1 + + if use_cpu_locked_tensor: + h.free_cpu_locked_tensor(aio_buffer) + + assert os.path.isfile(aio_file) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, aio_file, shallow=False) + + +@pytest.mark.sequential +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) +@pytest.mark.parametrize("cuda_device", [True, False]) +class TestAsyncQueue(DistributedTest): + world_size = 1 + requires_cuda_env = False + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + @pytest.mark.parametrize("async_queue", [2, 3]) + def 
test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, + use_cuda_pinned_tensor=use_cuda_pinned_tensor) + + ref_files = [] + for i in range(async_queue): + f, _ = _do_ref_write(tmpdir, i) + ref_files.append(f) + + single_submit = True + overlap_events = True + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, + QUEUE_DEPTH, + single_submit, + overlap_events, + IO_PARALLEL) + + use_cpu_locked_tensor = False + if cuda_device: + aio_buffers = [ + torch.empty(IO_SIZE, + dtype=torch.uint8, + device=get_accelerator().device_name()) + for _ in range(async_queue) + ] + elif use_cuda_pinned_tensor: + aio_buffers = [ + get_accelerator().pin_memory( + torch.empty(IO_SIZE, + dtype=torch.uint8, + device='cpu')) for _ in range(async_queue) + ] + else: + tmp_tensor = torch.empty(0, dtype=torch.uint8) + aio_buffers = [ + h.new_cpu_locked_tensor(IO_SIZE, + tmp_tensor) for _ in range(async_queue) + ] + use_cpu_locked_tensor = True + + _validate_handle_state(h, single_submit, overlap_events) + + for i in range(async_queue): + read_status = h.async_pread(aio_buffers[i], ref_files[i]) + assert read_status == 0 + + wait_status = h.wait() + assert wait_status == async_queue + + for i in range(async_queue): + with open(ref_files[i], 'rb') as f: + ref_buffer = list(f.read()) + assert ref_buffer == aio_buffers[i].tolist() + + if use_cpu_locked_tensor: + for t in aio_buffers: + h.free_cpu_locked_tensor(t) + + @pytest.mark.parametrize("async_queue", [2, 3]) + def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, + use_cuda_pinned_tensor=use_cuda_pinned_tensor) + + ref_files = [] + ref_buffers = [] + for i in range(async_queue): + f, buf = _do_ref_write(tmpdir, i) + ref_files.append(f) + ref_buffers.append(buf) + + single_submit = True + overlap_events = True + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, + QUEUE_DEPTH, + 
single_submit, + overlap_events, + IO_PARALLEL) + + aio_files = [] + aio_buffers = [] + for i in range(async_queue): + if cuda_device: + f, buf = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffers[i], i) + elif use_cuda_pinned_tensor: + f, buf = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffers[i], None, i) + else: + f, buf = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffers[i], h, i) + aio_files.append(f) + aio_buffers.append(buf) + + use_cpu_locked_tensor = not (cuda_device or use_cuda_pinned_tensor) + + _validate_handle_state(h, single_submit, overlap_events) + + for i in range(async_queue): + read_status = h.async_pwrite(aio_buffers[i], aio_files[i]) + assert read_status == 0 + + wait_status = h.wait() + assert wait_status == async_queue + + if use_cpu_locked_tensor: + for t in aio_buffers: + h.free_cpu_locked_tensor(t) + + for i in range(async_queue): + assert os.path.isfile(aio_files[i]) + + filecmp.clear_cache() + assert filecmp.cmp(ref_files[i], aio_files[i], shallow=False) diff --git a/tests/unit/ops/quantizer/test_dequantize.py b/tests/unit/ops/quantizer/test_dequantize.py new file mode 100644 index 0000000000000000000000000000000000000000..5dc2f7d68a7044ee3283de15ec7dae7eba700767 --- /dev/null +++ b/tests/unit/ops/quantizer/test_dequantize.py @@ -0,0 +1,96 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" + +import pytest +import torch +from deepspeed.ops import op_builder +from deepspeed.accelerator import get_accelerator + +quantize_module = None + + +def int4x2to2xint4(int4X2tensor): + high = int4X2tensor >> 4 + low = (int4X2tensor << 4) >> 4 + return torch.stack((high, low), dim=-1).flatten() + + +def run_quantize(data, num_groups, q_bits, is_symmetric_quant): + global quantize_module + if quantize_module is None: + quantize_module = op_builder.QuantizerBuilder().load() + + return quantize_module.quantize( + data, + num_groups, + q_bits, + quantize_module.Symmetric if is_symmetric_quant else quantize_module.Asymmetric) + + 
def run_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_quant):
    """Dequantize with the DeepSpeed quantizer op, loading it on first use."""
    global quantize_module
    if quantize_module is None:
        quantize_module = op_builder.QuantizerBuilder().load()

    quant_mode = (quantize_module.Symmetric
                  if is_symmetric_quant else quantize_module.Asymmetric)
    return quantize_module.dequantize(quantized_data, params, num_groups, q_bits,
                                      quant_mode)


def run_ref_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_quant):
    """Pure-PyTorch reference dequantization.

    With 4-bit data each byte carries two packed int4 values, so unpack first.
    Symmetric mode multiplies by `params`; asymmetric mode applies per-group
    scale (params[:, 0]) and offset (params[:, 1]).
    """
    if q_bits == 4:
        quantized_data = int4x2to2xint4(quantized_data)

    grouped = quantized_data.reshape(num_groups, -1).to(torch.float32)

    if is_symmetric_quant:
        dequant = grouped * params
    else:
        scales = params[:, 0].reshape(-1, 1)
        offsets = params[:, 1].reshape(-1, 1)
        dequant = grouped * scales + offsets
    return dequant.to(torch.float16)


@pytest.mark.inference_ops
@pytest.mark.parametrize("num_groups", [1, 13, 512])
@pytest.mark.parametrize("num_elems",
                         [8, 16, 32, 64, 128, 256, 4096, 8192, 12288, 16384])
@pytest.mark.parametrize("is_symmetric_quant", [True, False])
@pytest.mark.parametrize("q_bits", [4, 8])
def test_dequantize(num_elems, num_groups, is_symmetric_quant, q_bits):
    """Quantize random fp16 activations, then compare op vs reference dequantization."""
    activations = torch.randn((num_groups, num_elems),
                              dtype=torch.float16,
                              device=get_accelerator().device_name())
    quantized_data, params = run_quantize(activations, num_groups, q_bits,
                                          is_symmetric_quant)

    ds_dequant = run_dequantize(quantized_data, params, num_groups, q_bits,
                                is_symmetric_quant)
    ref_dequant = run_ref_dequantize(quantized_data, params, num_groups, q_bits,
                                     is_symmetric_quant)

    assert torch.allclose(ds_dequant.flatten(),
                          ref_dequant.flatten(),
                          rtol=3e-2,
                          atol=2e-3)
# diff --git a/tests/unit/ops/quantizer/test_fake_quantization.py b/tests/unit/ops/quantizer/test_fake_quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..c5304f7694eeee5de4dcae5b0ff56e59f114836e --- /dev/null +++
# b/tests/unit/ops/quantizer/test_fake_quantization.py @@ -0,0 +1,64 @@
'''Copyright The Microsoft DeepSpeed Team'''

import torch
import pytest
from deepspeed.accelerator import get_accelerator
from deepspeed.ops import op_builder

# Lazily-loaded handle to the compiled quantizer kernels.
quantizer_cuda_module = None


def allclose(x, y):
    """torch.allclose with per-dtype tolerances; both inputs must share a dtype."""
    assert x.dtype == y.dtype
    rtol, atol = {torch.float32: (2e-2, 5e-3), torch.float16: (2e-2, 5e-3)}[x.dtype]
    return torch.allclose(x, y, rtol=rtol, atol=atol)


def quantize_dequantize_ref(inputs, bit, num_groups=1):
    """Reference symmetric fake-quantization: quantize then immediately dequantize."""
    q_range = 2**bit
    flat = inputs.float().reshape(num_groups, -1).contiguous()
    flat = torch.nan_to_num(flat, nan=0.0)
    group_min = flat.amin(-1, keepdim=True)
    group_max = flat.amax(-1, keepdim=True)

    # Symmetric scale around zero; the 1e-5 keeps an all-zero group finite.
    scale = q_range / (2 * torch.max(group_min.abs(), group_max.abs() + 1e-5))
    flat = (flat * scale).round().clamp(-q_range // 2, q_range // 2 - 1)
    # Dequantize through int8 / fp16 to mirror what the kernel emits.
    dequant_t = torch.t(flat.to(torch.int8)) / scale.view(-1).to(torch.float16)
    return torch.t(dequant_t).reshape(inputs.shape)


def run_quant_dequant(inputs, groups, bits):
    """Quantize+dequantize via the DeepSpeed kernel; returns the dequantized value."""
    global quantizer_cuda_module

    if quantizer_cuda_module is None:
        quantizer_cuda_module = op_builder.QuantizerBuilder().load()
    return quantizer_cuda_module.ds_quantize_fp16(inputs, groups, bits)


@pytest.mark.inference_ops
@pytest.mark.parametrize("tensor_shape", [(16, 4096), (128, 256)])
@pytest.mark.parametrize("groups", [1, 16])
# Note: there is an explicit boundary for groups as ((size / groups) - 1) / 4096 + 1) <= MAX_REG.
def test_fake_quant_dequant(tensor_shape, groups):
    """Check kernel fake-quantization against the reference at 8 and then 4 bits."""
    input_tensor = torch.rand((tensor_shape),
                              dtype=torch.float16).to(get_accelerator().device_name())

    for bits in (8, 4):
        ref_input = input_tensor.clone().detach()
        ds_input = input_tensor.clone().detach()
        ref_out = quantize_dequantize_ref(ref_input, bits, groups)
        # run_quant_dequant quantizes then dequantizes, returning the dequantized value.
        ds_out = run_quant_dequant(ds_input, groups, bits)
        assert allclose(ds_out, ref_out)
# diff --git a/tests/unit/ops/quantizer/test_quantize.py b/tests/unit/ops/quantizer/test_quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..3cfd812e63f97832a18893b0548788287ff521e0 --- /dev/null +++ b/tests/unit/ops/quantizer/test_quantize.py @@ -0,0 +1,162 @@
"""
Copyright 2022 The Microsoft DeepSpeed Team
"""

import pytest
import torch
from deepspeed.ops import op_builder
from deepspeed.accelerator import get_accelerator

# Lazily-loaded handle to the compiled quantizer kernels.
inference_module = None


def run_quantize_ds(activations, num_groups, q_bits, is_symmetric_quant):
    """Quantize via the DeepSpeed quantizer op (built on first use)."""
    global inference_module
    if inference_module is None:
        inference_module = op_builder.QuantizerBuilder().load()

    mode = (inference_module.Symmetric
            if is_symmetric_quant else inference_module.Asymmetric)
    return inference_module.quantize(activations, num_groups, q_bits, mode)


def get_q_props(q_bits):
    """Return (range, max, min) of the signed q_bits integer domain as device tensors."""
    q_range = 2**q_bits
    q_min = torch.IntTensor([-(2**(q_bits - 1))]).to(device=get_accelerator().device_name())
    q_max = torch.IntTensor([2**(q_bits - 1) - 1]).to(device=get_accelerator().device_name())
    return q_range, q_max, q_min


def get_scale_zero_point(q_bits,
                         is_symmetric_quant,
                         max,
                         min,
                         absmax,
                         scales=None,
                         zero_points=None):
    """Per-group quantization scale and zero point.

    Symmetric: scale spans [-absmax, absmax] and the zero point is 0.
    Asymmetric: scale spans [min, max] and the zero point shifts min onto q_min.
    Degenerate groups (all zero / constant) fall back to a scale of 1.
    """
    q_range, q_max, q_min = get_q_props(q_bits)

    if is_symmetric_quant:
        # where() evaluates both branches; the inf from a zero absmax is discarded.
        scale = torch.where(absmax == 0, torch.ones_like(absmax), q_range / (2 * absmax))
        zero_point = torch.zeros(scale.shape,
                                 dtype=torch.float32,
                                 device=get_accelerator().device_name())
    else:
        scale = torch.where(max == min, torch.ones_like(max), q_range / (max - min))
        zero_point = q_min - (min * scale)

    return scale, zero_point


def int4x2to2xint4(int4X2tensor):
    """Unpack two-per-byte int4 values into individual int4s stored as int8."""
    high = int4X2tensor >> 4
    low = (int4X2tensor << 4) >> 4
    return torch.stack((high, low), dim=-1).flatten()


def run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups):
    """Reference quantization.

    Modeled on https://pytorch.org/docs/stable/quantization-support.html.
    Returns (int8 data, params) where params holds per-group [1/scale, zero_point].
    """
    grouped = activations_ref.reshape(num_groups, -1).to(dtype=torch.float32)

    absmax = torch.amax(torch.abs(grouped), dim=-1).view(num_groups, -1)
    group_max = torch.amax(grouped, dim=-1).view(num_groups, -1)
    group_min = torch.amin(grouped, dim=-1).view(num_groups, -1)

    _, q_max, q_min = get_q_props(q_bits)

    scale, zero_point = get_scale_zero_point(q_bits, is_symmetric_quant, group_max,
                                             group_min, absmax)

    data_f = grouped * scale
    if not is_symmetric_quant:
        data_f = data_f + zero_point

    data_i32 = torch.round(data_f).to(dtype=torch.int32)
    data_i32 = torch.minimum(torch.maximum(data_i32, q_min.expand_as(data_i32)),
                             q_max.expand_as(data_i32))
    data_i8 = data_i32.to(dtype=torch.int8)

    params = torch.cat(((1.0 / scale).reshape(-1, 1), zero_point.reshape(-1, 1)),
                       dim=-1)
    return data_i8, params


@pytest.mark.inference_ops
@pytest.mark.parametrize("num_groups", [1, 13, 512])
@pytest.mark.parametrize("num_elems",
                         [8, 16, 32, 64, 128, 256, 4096, 8192, 12288, 16384])
@pytest.mark.parametrize("is_symmetric_quant", [True, False])
@pytest.mark.parametrize("q_bits", [4, 8])
@pytest.mark.parametrize("directed_case", ["all_zeros", None])
def test_float_quantize(num_elems, num_groups, is_symmetric_quant, q_bits, directed_case):
    """Compare kernel quantization (values and params) against the reference."""
    shape = (num_groups, num_elems)
    device = get_accelerator().device_name()
    if directed_case == "all_zeros":
        activations_ds = torch.zeros(shape, dtype=torch.float16, device=device)
    else:
        activations_ds = torch.randn(shape, dtype=torch.float16, device=device)
    activations_ref = activations_ds.clone().detach()

    ref_out_tensor, ref_params = run_float_quantize(q_bits, is_symmetric_quant,
                                                    activations_ref, num_groups)
    ds_out_tensor, ds_out_params = run_quantize_ds(activations_ds, num_groups, q_bits,
                                                   is_symmetric_quant)

    if q_bits == 4:
        ds_out_tensor = int4x2to2xint4(ds_out_tensor)

    # Allow a max difference of 1 to account for differences in rounding in pytorch implementation
    assert torch.all(
        torch.lt(torch.abs(ds_out_tensor.flatten() - ref_out_tensor.flatten()), 2))

    if is_symmetric_quant:
        assert torch.allclose(ds_out_params.flatten(), ref_params[:, 0].flatten())
    else:
        assert torch.allclose(ds_out_params[:, 0].flatten(), ref_params[:, 0].flatten())
        assert torch.allclose(ds_out_params[:, 1].flatten(),
                              ref_params[:, 1].flatten(),
                              atol=5e-5,
                              rtol=5e-5)
# diff --git a/tests/unit/ops/sparse_attention/test_sparse_attention.py b/tests/unit/ops/sparse_attention/test_sparse_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..bab57ccdc181609b85275e37a39f4b04555f8ab0 --- /dev/null +++ b/tests/unit/ops/sparse_attention/test_sparse_attention.py @@ -0,0 +1,271 @@
'''Copyright The Microsoft DeepSpeed Team'''

# DeepSpeed note, some parts of code taken & adapted from commit c368a9fd1b2c9dee4cc94de9a6bb0be3d447be41
# https://github.com/ptillet/torch-blocksparse/blob/master/tests/test_softmax.py
+# https://github.com/ptillet/torch-blocksparse/blob/master/tests/test_matmul.py +# https://github.com/ptillet/torch-blocksparse/blob/master/tests/utils + +import pytest +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import SparseAttnBuilder + +if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]: + pytest.skip("sparse attention op is not compatible on this system", + allow_module_level=True) + + +def dense_to_sparse(w, mask, block): + """Converts dense matrix with explicit zeros to sparse matrix + """ + Z = w.size(0) + ret = torch.empty((Z, mask.sum(), block, block), dtype=w.dtype, device=w.device) + nnz = mask.nonzero() + h, i, j = nnz[:, 0], nnz[:, 1], nnz[:, 2] + for zz in range(Z): + for idx, (hh, ii, jj) in enumerate(zip(h, i, j)): + ret[zz, idx, :, :] = w[zz, hh, ii*block: (ii+1)*block, jj*block: (jj+1)*block] + return ret + + +def sparse_to_dense(w, mask, block, zero=0): + """Converts sparse matrix to dense matrix with explicit zeros + """ + maskedw = w.clone() + for bz, wz in enumerate(range(0, w.size(0))): + for bh, wh in enumerate(range(0, w.size(1))): + for bi, wi in enumerate(range(0, w.size(2), block)): + for bj, wj in enumerate(range(0, w.size(3), block)): + if mask[bh, bi, bj] == 0: + maskedw[wz, wh, wi:wi + block, wj:wj + block] = zero + #maskedw[wz, wh, wi : wi+block, wj : wj+block] *= mask[bh, bi, bj] + return maskedw + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (5e-4, 5e-5), torch.float16: (3e-2, 2e-3)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def make_layout(rho, shape): + probs = torch.Tensor([rho, 1 - rho]) + generator = torch.distributions.categorical.Categorical(probs) + layout = generator.sample(shape) + return layout + + +def run_softmax_reference(x, scale, dx, kp_mask, attn_mask, layout, block): + x = sparse_to_dense(x, layout, block, zero=float('-inf')) + x.retain_grad() + if kp_mask is not None: 
+ bcattn_mask = attn_mask[None, None, :, :] + torch.zeros_like(x) + x[bcattn_mask == 0] = float('-inf') + y = torch.softmax(x * scale + kp_mask[:, None, None, :], -1) + else: + y = torch.softmax(x * scale, -1) + y.backward(dx) + dx = x.grad.clone() + dx = dense_to_sparse(dx, layout, block) + y = dense_to_sparse(y, layout, block) + return y, dx + + +def run_softmax_sparse(x, scale, dx, kp_mask, attn_mask, layout, block): + from deepspeed.ops.sparse_attention.softmax import Softmax + sparse_softmax = Softmax(layout, block, bench=False) + + dx = dense_to_sparse(dx, layout, block) + x = dense_to_sparse(x, layout, block) + x.retain_grad() + y = sparse_softmax(x, + scale=scale, + key_padding_mask=kp_mask, + key_padding_mask_mode='add', + attn_mask=attn_mask, + attn_mask_mode='mul') + y.backward(dx) + dx = x.grad.clone() + x.grad.zero_() + return x, dx + + +def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layout=None): + if layout is None: + layout = make_layout(rho, (H, M // block, N // block)) + if dense_x: + x = torch.rand((Z, + H, + M, + N), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) + else: + x = torch.rand((Z, + layout.sum(), + block, + block), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) + dx = torch.rand_like(x) + bool_attn_mask = torch.randint(low=0, + high=2, + size=(N, + N), + dtype=torch.bool, + requires_grad=False, + device=get_accelerator().device_name()) + fp_attn_mask = bool_attn_mask.type(dtype) + kp_mask = torch.randint(low=0, + high=2, + size=(Z, + N), + dtype=dtype, + requires_grad=False, + device=get_accelerator().device_name()) + kp_mask[kp_mask == 1.] 
= float('-inf') + return layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask + + +def _skip_on_cuda_compatability(): + if deepspeed.accelerator.get_accelerator().device_name() == 'cuda': + if torch.cuda.get_device_capability()[0] < 7: + pytest.skip("needs higher compute capability than 7") + cuda_major = int(torch.version.cuda.split('.')[0]) * 10 + cuda_minor = int(torch.version.cuda.split('.')[1]) + cuda_version = cuda_major + cuda_minor + if (cuda_version != 101 and cuda_version != 102) and \ + (cuda_version != 111 and cuda_version != 110): + pytest.skip("requires cuda 10.1 or 10.2 or 11.0 or 11.1") + else: + assert deepspeed.accelerator.get_accelerator().device_name() == 'xpu' + return + + +@pytest.mark.parametrize("block", [16, 32]) +@pytest.mark.parametrize("width", [256, 576]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +def test_softmax(block, width, dtype): + #_skip_on_cuda_compatability() + Z = 2 + H = 4 + scale = 0.4 + rho = 0.4 + M = N = width + layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask = init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, layout=None) + ref_y, ref_dx = run_softmax_reference(x, scale, dx, kp_mask, bool_attn_mask, layout, block) + st_y, st_dx = run_softmax_sparse(x, scale, dx, kp_mask, fp_attn_mask, layout, block) + + assert allclose(ref_y, st_y) + assert allclose(ref_dx, st_dx) + + +def run_matmul_reference(x, w, mode, trans_a, trans_b, layout, block, dy): + x = sparse_to_dense(x, layout, block) if mode == 'dsd' else x + w = sparse_to_dense(w, layout, block) if mode == 'dds' else w + x.retain_grad() + w.retain_grad() + xx = x.transpose(2, 3) if trans_a else x + ww = w.transpose(2, 3) if trans_b else w + y = torch.matmul(xx, ww) + y = sparse_to_dense(y, layout, block) if mode == 'sdd' else y + y.backward(dy) + dx = x.grad.clone() + dw = w.grad.clone() + x.grad.zero_() + w.grad.zero_() + y = dense_to_sparse(y, layout, block) if mode == 'sdd' else y + dx = dense_to_sparse(dx, layout, block) if mode 
== 'dsd' else dx + dw = dense_to_sparse(dw, layout, block) if mode == 'dds' else dw + return y, dx, dw + + +def run_matmul_sparse(x, w, mode, trans_a, trans_b, layout, block, dy): + from deepspeed.ops.sparse_attention.matmul import MatMul + x = dense_to_sparse(x, layout, block) if mode == 'dsd' else x + w = dense_to_sparse(w, layout, block) if mode == 'dds' else w + dy = dense_to_sparse(dy, layout, block) if mode == 'sdd' else dy + op = MatMul(layout, block, mode, trans_a=trans_a, trans_b=trans_b) + x.retain_grad() + w.retain_grad() + y = op(x, w) + y.backward(dy) + dx = x.grad.clone() + dw = w.grad.clone() + x.grad.zero_() + return y, dx, dw + + +def init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, layout): + torch.manual_seed(1) + AS0 = K if trans_a else M + AS1 = M if trans_a else K + BS0 = N if trans_b else K + BS1 = K if trans_b else N + shape = {'sdd': (M, N), 'dsd': (AS0, AS1), 'dds': (BS0, BS1)}[mode] + x = torch.rand((Z, + H, + AS0, + AS1), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) + w = torch.rand((Z, + H, + BS0, + BS1), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) + dy = torch.rand((Z, H, M, N), dtype=dtype, device=get_accelerator().device_name()) + if layout is None: + layout = make_layout(rho, (H, shape[0] // block, shape[1] // block)) + else: + assert list(layout.shape) == [H, shape[0] // block, shape[1] // block] + x.retain_grad() + w.retain_grad() + return x, w, dy, shape, layout + +testdata = [ + (16, dtype, mode, trans_a, trans_b)\ + for dtype in [torch.float16]\ + for mode in ['sdd', 'dds']\ + for trans_a in [False]\ + for trans_b in [False, True]\ + ] + [ + (16, dtype, mode, trans_a, trans_b)\ + for dtype in [torch.float16]\ + for mode in ['dsd']\ + for trans_a in [False, True]\ + for trans_b in [False]\ + ] + [ + (16, dtype, mode, trans_a, trans_b)\ + for dtype in [torch.float32]\ + for mode in ['sdd', 'dsd', 'dds']\ + for trans_a in [False]\ + 
for trans_b in [False]\ + ] + [ + (block, torch.float16, mode, False, False)\ + for block in [16, 32, 64]\ + for mode in ['sdd', 'dsd', 'dds']\ + ] + + +@pytest.mark.parametrize("block, dtype, mode, trans_a, trans_b", testdata) +def test_matmul(block, dtype, mode, trans_a, trans_b): + #_skip_on_cuda_compatability() + Z = 3 + H = 2 + M = 128 + N = 256 + K = 192 + rho = 0.5 + x, w, dy, shape, layout = init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, layout=None) + ref_y, ref_dx, ref_dw = run_matmul_reference(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy) + st_y, st_dx, st_dw = run_matmul_sparse(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy) + assert allclose(ref_y, st_y) + assert allclose(ref_dx, st_dx) + assert allclose(ref_dw, st_dw) diff --git a/tests/unit/ops/spatial/test_nhwc_bias_add.py b/tests/unit/ops/spatial/test_nhwc_bias_add.py new file mode 100644 index 0000000000000000000000000000000000000000..f3a31cf47ba481a4e4b1a1a24980ffabe586b8b1 --- /dev/null +++ b/tests/unit/ops/spatial/test_nhwc_bias_add.py @@ -0,0 +1,136 @@ +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +import pytest +import torch +from deepspeed.ops.transformer.inference.bias_add import nhwc_bias_add +from deepspeed.accelerator import get_accelerator + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (5e-3, 5e-4), torch.float16: (3e-2, 2e-3), torch.int8: (1, 1)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def ref_bias_add(activations, bias): + return activations + bias.reshape(1, -1, 1, 1) + + +channels_list = [ + 192, + 384, + 320, + 576, + 640, + 768, + 960, + 1152, + 1280, + 1536, + 1600, + 1920, + 2240, + 2560 +] + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 2, 10]) +@pytest.mark.parametrize("image_size", [16, 32, 64]) +@pytest.mark.parametrize("channels", channels_list) +def test_bias_add(batch, image_size, channels): + activations = 
torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) + + ref_vals = ref_bias_add(activations.clone().detach(), bias) + ds_vals = nhwc_bias_add(activations, bias) + + assert allclose(ds_vals, ref_vals) + + +def ref_bias_add_add(activations, bias, other): + return (activations + bias.reshape(1, -1, 1, 1)) + other + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 2, 10]) +@pytest.mark.parametrize("image_size", [16, 32, 64]) +@pytest.mark.parametrize("channels", channels_list) +def test_bias_add_add(batch, image_size, channels): + activations = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + other = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) + + ref_vals = ref_bias_add_add(activations.clone().detach(), bias, other) + ds_vals = nhwc_bias_add(activations, bias, other=other) + + assert allclose(ds_vals, ref_vals) + + +def ref_bias_add_bias_add(activations, bias, other, other_bias): + return (activations + bias.reshape(1, + -1, + 1, + 1)) + (other + other_bias.reshape(1, + -1, + 1, + 1)) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 2, 10]) +@pytest.mark.parametrize("image_size", [16, 32, 64]) +@pytest.mark.parametrize("channels", channels_list) +def test_bias_add_bias_add(batch, image_size, channels): + activations = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + 
other = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) + other_bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) + + ref_vals = ref_bias_add_bias_add(activations.clone().detach(), + bias, + other, + other_bias) + ds_vals = nhwc_bias_add(activations, bias, other=other, other_bias=other_bias) + + assert allclose(ds_vals, ref_vals) diff --git a/tests/unit/ops/transformer/inference/test_bias_add.py b/tests/unit/ops/transformer/inference/test_bias_add.py new file mode 100644 index 0000000000000000000000000000000000000000..f3475a14766d01c2a94576a817d0c65b94970e70 --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_bias_add.py @@ -0,0 +1,57 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("Inference ops are not available on this system", + allow_module_level=True) + +inference_module = None +torch_minor_version = None + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (5e-4, 5e-5), torch.float16: (3e-2, 2e-3)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def run_bias_add_reference(activations, bias): + return activations + bias + + +def run_bias_add_ds(activations, bias): + global inference_module + if inference_module is None: + inference_module = InferenceBuilder().load() + if activations.dtype == torch.float16: + return inference_module.bias_add_fp16(activations, bias) + else: + return inference_module.bias_add_fp32(activations, bias) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 2]) 
+@pytest.mark.parametrize("sequence", [1, 128, 255]) +@pytest.mark.parametrize("channels", [512, 1232, 4096]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) +def test_bias_add(batch, sequence, channels, dtype): + activations_ds = torch.randn((batch, + sequence, + channels), + dtype=dtype, + device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), + dtype=dtype, + device=get_accelerator().device_name()) + + activations_ref = activations_ds.clone().detach() + bias_ref = bias_ds.clone().detach() + + ds_out = run_bias_add_ds(activations_ds, bias_ds) + ref_out = run_bias_add_reference(activations_ref, bias_ref) + assert allclose(ds_out, ref_out) diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py b/tests/unit/ops/transformer/inference/test_bias_geglu.py new file mode 100644 index 0000000000000000000000000000000000000000..c70974e51d94a9cb22a705b90f2df8d0f8b9434e --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py @@ -0,0 +1,58 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" + +import pytest +import torch +import deepspeed +from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("Inference ops are not available on this system", + allow_module_level=True) + +inference_module = None +torch_minor_version = None + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (5e-3, 5e-4), torch.float16: (3e-2, 2e-3), torch.int8: (0, 0)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def run_bias_geglu_reference(activations, bias): + # Expected behavior is that of casting to float32 internally + # Explicitly using the default GeLU + activations = activations + bias.reshape(1, 1, -1) + hidden_states, gate = activations.chunk(2, dim=-1) + return hidden_states * 
torch.nn.functional.gelu(gate.to(torch.float32)).to( + activations.dtype) + + +def run_bias_geglu_ds(activation, bias): + global inference_module + if inference_module is None: + inference_module = InferenceBuilder().load() + return inference_module.bias_geglu(activation, bias) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 2]) +@pytest.mark.parametrize("sequence", [1, 128, 255]) +@pytest.mark.parametrize("channels", [512, 1232, 4096]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +def test_bias_geglu(batch, sequence, channels, dtype): + activation = torch.randn((batch, + sequence, + channels * 2), + dtype=dtype, + device=get_accelerator().device_name()) + bias = torch.randn((channels * 2), + dtype=dtype, + device=get_accelerator().device_name()) + + ds_out = run_bias_geglu_ds(activation, bias) + ref_out = run_bias_geglu_reference(activation, bias) + assert (allclose(ds_out, ref_out)) diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py new file mode 100644 index 0000000000000000000000000000000000000000..3c1762179ead362e7acc05f83682525e2f85e4e5 --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py @@ -0,0 +1,66 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" + +import pytest +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder +from packaging import version as pkg_version + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("Inference ops are not available on this system", + allow_module_level=True) + +inference_module = None +torch_minor_version = None + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (5e-4, 5e-5), torch.float16: (3e-2, 2e-3)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def run_bias_gelu_reference(activations, bias): + # Expected 
behavior is that of casting to float32 internally and using the tanh approximation + return torch.nn.functional.gelu(activations.to(torch.float32) + + bias.to(torch.float32), + approximate='tanh').to(activations.dtype) + + +def run_bias_gelu_ds(activations, bias): + global inference_module + if inference_module is None: + inference_module = InferenceBuilder().load() + if activations.dtype == torch.float16: + return inference_module.bias_gelu_fp16(activations, bias) + else: + return inference_module.bias_gelu_fp32(activations, bias) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 2]) +@pytest.mark.parametrize("sequence", [1, 128, 255]) +@pytest.mark.parametrize("channels", [512, 1232, 4096]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +def test_bias_gelu(batch, sequence, channels, dtype): + if pkg_version.parse(torch.__version__) < pkg_version.parse("1.12"): + pytest.skip("gelu implementation matches only after torch 1.12") + + activations_ds = torch.randn((batch, + sequence, + channels), + dtype=dtype, + device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), + dtype=dtype, + device=get_accelerator().device_name()) + + activations_ref = activations_ds.clone().detach() + bias_ref = bias_ds.clone().detach() + + ds_out = run_bias_gelu_ds(activations_ds, bias_ds) + ref_out = run_bias_gelu_reference(activations_ref, bias_ref) + assert (allclose(ds_out, ref_out)) diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py new file mode 100644 index 0000000000000000000000000000000000000000..e2b66f6bd2e18da03c2512933bb2d08ee9daf9ad --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_bias_relu.py @@ -0,0 +1,61 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" + +import pytest +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder + +if not 
deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("Inference ops are not available on this system", + allow_module_level=True) + +inference_module = None +torch_minor_version = None + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (5e-4, 5e-5), torch.float16: (3e-2, 2e-3)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def run_bias_relu_reference(activations, bias): + # Expected behavior is that of casting to float32 internally + return torch.nn.functional.relu( + activations.to(torch.float32) + bias.to(torch.float32)).to(activations.dtype) + + +def run_bias_relu_ds(activations, bias): + global inference_module + if inference_module is None: + inference_module = InferenceBuilder().load() + if activations.dtype == torch.float16: + return inference_module.bias_relu_fp16(activations, bias) + else: + return inference_module.bias_relu_fp32(activations, bias) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 2]) +@pytest.mark.parametrize("sequence", [1, 128, 255]) +@pytest.mark.parametrize("channels", [512, 1232, 4096]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +def test_bias_relu(batch, sequence, channels, dtype): + activations_ds = torch.randn((batch, + sequence, + channels), + dtype=dtype, + device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), + dtype=dtype, + device=get_accelerator().device_name()) + + activations_ref = activations_ds.clone().detach() + bias_ref = bias_ds.clone().detach() + + ds_out = run_bias_relu_ds(activations_ds, bias_ds) + ref_out = run_bias_relu_reference(activations_ref, bias_ref) + assert (allclose(ds_out, ref_out)) diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..61f6455629e6b598e646b3f1fe908d1c34ccb8b8 --- /dev/null +++ 
b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -0,0 +1,202 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" + +import deepspeed +import torch +import pytest +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("Inference ops are not available on this system", + allow_module_level=True) + +inference_module = None + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (5e-4, 5e-5), torch.float16: (3e-2, 2e-3)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def ref_implementation(vals, gamma, beta, espilon, channels, dtype): + vals_f = vals.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return torch.nn.functional.layer_norm(vals_f, + (channels, + ), + weight=gamma_f, + bias=beta_f).to(dtype) + + +def ds_implementation(vals, gamma, beta, epsilon): + global inference_module + if inference_module is None: + inference_module = InferenceBuilder().load() + return inference_module.layer_norm(vals, gamma, beta, epsilon) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 32]) +@pytest.mark.parametrize("seq_len", [1, 128]) +@pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +def test_layer_norm(batch, seq_len, channels, dtype): + vals = torch.randn((batch, + seq_len, + channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + beta = torch.rand((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + epsilon = 1e-5 + + ref_output = ref_implementation(vals, gamma, beta, epsilon, channels, dtype) + new_output = ds_implementation(vals, gamma, beta, epsilon) + + assert 
allclose(new_output, ref_output) + + +def residual_ref_implementation(vals, bias, res, gamma, beta, espilon, channels, dtype): + vals_f = vals.to(torch.float32) + bias_f = bias.to(torch.float32).reshape(1, 1, -1) + res_f = res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return torch.nn.functional.layer_norm(vals_f + bias_f + res_f, + (channels, + ), + weight=gamma_f, + bias=beta_f).to(dtype) + + +def residual_ds_implementation(vals, bias, res, gamma, beta, epsilon): + global inference_module + if inference_module is None: + inference_module = InferenceBuilder().load() + return inference_module._layer_norm_residual(vals, bias, res, gamma, beta, epsilon) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 32]) +@pytest.mark.parametrize("seq_len", [1, 128]) +@pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +def test_layer_norm_residual(batch, seq_len, channels, dtype): + vals = torch.randn((batch, + seq_len, + channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + residual = torch.randn((batch, + seq_len, + channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + bias = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + beta = torch.rand((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + epsilon = 1e-5 + + new_output = residual_ds_implementation(vals, bias, residual, gamma, beta, epsilon) + ref_output = residual_ref_implementation(vals, + bias, + residual, + gamma, + beta, + epsilon, + channels, + dtype) + + assert allclose(new_output, ref_output) + + +def residual_store_ref_implementation(vals, + bias, + res, + gamma, + beta, + espilon, + channels, + dtype): + vals_f = vals.to(torch.float32) + bias_f 
= bias.to(torch.float32).reshape(1, 1, -1) + res_f = res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + res_output = vals_f + bias_f + res_f + norm_output = torch.nn.functional.layer_norm(res_output, + (channels, + ), + weight=gamma_f, + bias=beta_f).to(dtype) + return norm_output, res_output.to(dtype) + + +def residual_store_ds_implementation(vals, bias, res, gamma, beta, epsilon): + global inference_module + if inference_module is None: + inference_module = InferenceBuilder().load() + return inference_module.layer_norm_residual_store_pre_ln_res( + vals, + bias, + res, + gamma, + beta, + epsilon) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 32]) +@pytest.mark.parametrize("seq_len", [1, 128]) +@pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): + vals = torch.randn((batch, + seq_len, + channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + residual = torch.randn((batch, + seq_len, + channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + bias = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + beta = torch.rand((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + epsilon = 1e-5 + + # Need to run the reference first since there's an in-place component to ours + ref_norm_output, norm_res_output = residual_store_ref_implementation(vals, + bias, + residual, + gamma, + beta, + epsilon, + channels, + dtype) + + ds_norm_output, ds_res_output = residual_store_ds_implementation(vals, bias, residual, gamma, beta, epsilon) + + assert allclose(ds_res_output, norm_res_output) + assert allclose(ds_norm_output, ref_norm_output) diff 
--git a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py new file mode 100644 index 0000000000000000000000000000000000000000..fdd6e8607c71e320577bef8ea3e215a97d6d68c7 --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py @@ -0,0 +1,69 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" + +import pytest +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("Inference ops are not available on this system", + allow_module_level=True) + +inference_module = None + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (5e-4, 5e-5), torch.float16: (3e-2, 2e-3)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def run_moe_res_matmul_reference(residual, coef1, coef2, output): + return residual * coef1 + output * coef2 + + +def run_moe_res_matmul_ds(residual, coef, output): + global inference_module + if inference_module is None: + inference_module = InferenceBuilder().load() + coef_t = coef.transpose(-1, -2).contiguous() + return inference_module.moe_res_matmul(residual, coef_t, output) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("hidden_dim", [16, 64]) +@pytest.mark.parametrize("c", [1, 4]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_moe_residual_matmul(hidden_dim, c, dtype): + residual_ds = torch.randn((c, + hidden_dim * c, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + coeff1 = torch.randn((1, + 1, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + coeff2 = torch.randn((1, + 1, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + out_ds = torch.randn((c, + hidden_dim * c, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) 
+ coeff_ds = torch.cat((coeff1, coeff2), dim=-1) + residual_ref = residual_ds.clone().detach() + coeff_ref = coeff_ds.clone().detach() + out_ref = out_ds.clone().detach() + + ds_out = run_moe_res_matmul_ds(residual_ds, coeff_ds, out_ds) + ref_out = run_moe_res_matmul_reference(residual_ref, coeff1, coeff2, out_ref) + + assert (allclose(ds_out, ref_out)) diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py new file mode 100644 index 0000000000000000000000000000000000000000..0dacee355369cf66a0738a19fc28e4e644ea194e --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -0,0 +1,151 @@ +""" +Copyright 2022 The Microsoft DeepSpeed Team +""" + +import pytest +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("Inference ops are not available on this system", + allow_module_level=True) + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (5e-4, 5e-5), torch.float16: (3e-2, 2e-2)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +@pytest.fixture(scope="module") +def inference_module(): + return InferenceBuilder().load() + + +def res_add_bias_ref(hidden_state, + residual, + attn_output, + attn_bias, + final_bias, + mp_size=1, + pre_attn_norm=True): + if pre_attn_norm: + hidden_state += (residual + final_bias + attn_output + attn_bias) / mp_size + else: + hidden_state += residual + final_bias + return hidden_state + + +def res_add_bias_ref_gptj(hidden_state, + residual, + attn_output, + attn_bias, + final_bias, + add_attn_bias, + mp_size): + hidden_state += attn_output + (residual + final_bias) / mp_size + if add_attn_bias: + hidden_state += attn_bias / mp_size + return hidden_state + + +def run_residual_add_reference(hidden_state, + residual, + attn_output, + 
attn_bias, + final_bias, + mlp_after_attn, + add_attn_bias, + mp_size, + pre_attn_norm): + if mlp_after_attn: + return res_add_bias_ref(hidden_state, + residual, + attn_output, + attn_bias, + final_bias, + mp_size, + pre_attn_norm) + else: + return res_add_bias_ref_gptj(hidden_state, + residual, + attn_output, + attn_bias, + final_bias, + add_attn_bias, + mp_size) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 2]) +@pytest.mark.parametrize("sequence", [1, 128, 255]) +@pytest.mark.parametrize("hidden_dim", [512, 1232, 4096]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +@pytest.mark.parametrize("mlp_after_attn", [True, False]) +@pytest.mark.parametrize("add_bias", [True, False]) +@pytest.mark.parametrize("mp_size", [1, 2]) +@pytest.mark.parametrize("pre_attn_norm", [True, False]) +def test_residual_add(inference_module, + batch, + sequence, + hidden_dim, + dtype, + mlp_after_attn, + add_bias, + mp_size, + pre_attn_norm): + ds_out = torch.randn((batch, + sequence, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + residual = torch.randn((batch, + sequence, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + attn_output = torch.randn((batch, + sequence, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + final_bias = torch.randn((hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + attn_bias = torch.randn((hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + + ref_out = ds_out.clone() + ref_out = run_residual_add_reference(ref_out, + residual, + attn_output, + attn_bias, + final_bias, + mlp_after_attn, + add_bias, + mp_size, + pre_attn_norm) + + res_add_args = [ + ds_out, + residual, + attn_output, + attn_bias, + final_bias, + mp_size, + mlp_after_attn, + add_bias, + pre_attn_norm + ] + + if dtype == torch.float16: + ds_out = inference_module.residual_add_bias_fp16(*res_add_args) + elif dtype == torch.float32: + ds_out 
= inference_module.residual_add_bias_fp32(*res_add_args) + else: + raise ValueError(f"Unsupported dtype: {dtype}") + + assert (allclose(ds_out, ref_out)) diff --git a/tests/unit/pipe/test_pipe_module.py b/tests/unit/pipe/test_pipe_module.py new file mode 100644 index 0000000000000000000000000000000000000000..e8404b0d5a17d229684f3e17acc01e4cb79e81a9 --- /dev/null +++ b/tests/unit/pipe/test_pipe_module.py @@ -0,0 +1,101 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import copy + +import torch +import torch.nn as nn +import deepspeed.comm as dist + +import pytest + +import deepspeed +from deepspeed.pipe import PipelineModule +from deepspeed.utils import RepeatingLoader +from deepspeed.accelerator import get_accelerator + +from unit.common import DistributedTest + +HIDDEN_DIM = 32 +LAYERS = 8 + + +@pytest.fixture +def sequential_model(): + model = torch.nn.Sequential( + *[nn.Linear(HIDDEN_DIM, + HIDDEN_DIM) for _ in range(LAYERS)], + nn.Linear(HIDDEN_DIM, + 1), + ) + return model + + +@pytest.fixture +def simple_config(): + config_dict = { + "train_batch_size": 1, + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "pipeline": { + "activation_checkpoint_interval": 1 + } + } + return config_dict + + +@pytest.fixture +def batch_input(): + return torch.randn(1, HIDDEN_DIM) + + +class TestPipeModuleSequential(DistributedTest): + world_size = 2 + + def test(self, sequential_model, simple_config, batch_input): + base_model = copy.deepcopy(sequential_model) + base_input = batch_input.clone().detach() + base_output = base_model(base_input) + base_output = base_output + base_params = sum(p.numel() for p in base_model.parameters()) + + pipe_model = copy.deepcopy(sequential_model) + pipe_model = PipelineModule(layers=pipe_model, num_stages=2) + + # Ensure all parameters are accounted for. 
+ my_params = sum(p.numel() for p in pipe_model.parameters()) + total_pipe_params = torch.LongTensor([my_params + ]).to(get_accelerator().device_name()) + dist.all_reduce(total_pipe_params) + total_pipe_params = total_pipe_params.item() + assert total_pipe_params == base_params + + pipe_model, _, _, _ = deepspeed.initialize( + config=simple_config, + model=pipe_model, + model_parameters=[p for p in pipe_model.parameters()]) + + if pipe_model.is_first_stage or pipe_model.is_last_stage: + pipe_input = base_input.clone().detach().to(get_accelerator().device_name()) + # label 0 is meaningless + dataset = [(pipe_input, 0)] + loader = RepeatingLoader(dataset) + data_iter = iter(loader) + else: + data_iter = None + + pipe_output = pipe_model.eval_batch(data_iter=data_iter) + + base_output = base_output.to('cpu') + pipe_output = pipe_output.to('cpu') + + assert torch.allclose(base_output, pipe_output, atol=1e-4) diff --git a/tests/unit/profiling/flops_profiler/test_flops_profiler.py b/tests/unit/profiling/flops_profiler/test_flops_profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..1f93533587c0574b0ee9df690796c57660f05f44 --- /dev/null +++ b/tests/unit/profiling/flops_profiler/test_flops_profiler.py @@ -0,0 +1,128 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import pytest +import deepspeed +from deepspeed.profiling.flops_profiler import get_model_profile +from unit.simple_model import SimpleModel, random_dataloader +from unit.common import DistributedTest + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) +pytestmark = pytest.mark.skipif(TORCH_MAJOR < 1 + or (TORCH_MAJOR == 1 and TORCH_MINOR < 3), + reason='requires Pytorch version 1.3 or above') + + +def within_range(val, target, tolerance): + return abs(val - target) / target < tolerance + + +TOLERANCE = 0.05 + + +class LeNet5(torch.nn.Module): + def __init__(self, n_classes): + super(LeNet5, self).__init__() + + 
self.feature_extractor = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=1, + out_channels=6, + kernel_size=5, + stride=1), + torch.nn.Tanh(), + torch.nn.AvgPool2d(kernel_size=2), + torch.nn.Conv2d(in_channels=6, + out_channels=16, + kernel_size=5, + stride=1), + torch.nn.Tanh(), + torch.nn.AvgPool2d(kernel_size=2), + torch.nn.Conv2d(in_channels=16, + out_channels=120, + kernel_size=5, + stride=1), + torch.nn.Tanh(), + ) + + self.classifier = torch.nn.Sequential( + torch.nn.Linear(in_features=120, + out_features=84), + torch.nn.Tanh(), + torch.nn.Linear(in_features=84, + out_features=n_classes), + ) + + def forward(self, x): + x = self.feature_extractor(x) + x = torch.flatten(x, 1) + logits = self.classifier(x) + probs = torch.nn.functional.softmax(logits, dim=1) + return logits, probs + + +class TestFlopsProfiler(DistributedTest): + world_size = 1 + + def test(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + } + }, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": True, + }, + "flops_profiler": { + "enabled": True, + "step": 1, + "module_depth": -1, + "top_modules": 3, + }, + } + hidden_dim = 10 + model = SimpleModel(hidden_dim, empty_grad=False) + + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.half) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + if n == 3: break + assert within_range(model.flops_profiler.flops, 200, tolerance=TOLERANCE) + assert model.flops_profiler.params == 110 + + def test_flops_profiler_in_inference(self): + mod = LeNet5(10) + batch_size = 1024 + input = torch.randn(batch_size, 1, 32, 32) + flops, macs, params = get_model_profile( + mod, + tuple(input.shape), + 
print_profile=True, + detailed=True, + module_depth=-1, + top_modules=3, + warm_up=1, + as_string=False, + ignore_modules=None, + ) + print(flops, macs, params) + assert within_range(flops, 866076672, TOLERANCE) + assert within_range(macs, 426516480, TOLERANCE) + assert params == 61706 diff --git a/tests/unit/run_test.sh b/tests/unit/run_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..a2ae7604f393dfa71f2d5db1f9318fa3e46f707f --- /dev/null +++ b/tests/unit/run_test.sh @@ -0,0 +1,18 @@ +#!/bin/bash +#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[9-True-1] +#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[9-True-2] +#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[9-True-3] +#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[10-True-1] +#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[10-True-2] +#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[10-True-3] +#pytest runtime/half_precision/test_fp16.py::TestZeroEmptyPartition::test[True-1] +#pytest runtime/half_precision/test_fp16.py::TestZeroEmptyPartition::test[True-2] + +pytest checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[2-True] +pytest checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[3-True] +pytest checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[2-True] +pytest checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[3-True] +pytest checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[2-True-deepspeed_adam] +pytest checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[3-True-deepspeed_adam] +pytest checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[2-True-deepspeed_adam] +pytest 
checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[3-True-deepspeed_adam] diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py new file mode 100644 index 0000000000000000000000000000000000000000..af354fe1caa6a77c95ad4f97fb6822f752a4b741 --- /dev/null +++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py @@ -0,0 +1,267 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +# TODO: add tests with model parallelism for activation partitioning and other features. + +import pytest +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +from copy import deepcopy +from unit.common import DistributedTest + +ckpt = deepspeed.checkpointing.checkpoint + + +def _compute(module, *inputs, do_checkpoint=False): + if do_checkpoint: + outputs = ckpt(module, *inputs) + else: + outputs = module(*inputs) + + if torch.is_tensor(outputs): + outputs = (outputs, ) + + sum(o.sum() for o in outputs if torch.is_tensor(o) and o.requires_grad).backward() + + grads = [p.grad for p in module.parameters()] + input_grads = [inp.grad for inp in inputs if torch.is_tensor(inp)] + + return { + 'outputs': outputs, + 'module_grads': grads, + 'input_grads': input_grads, + } + + +def _prep_inputs(*inputs): + _inputs = [] + + for inp in inputs: + inp = deepcopy(inp) + if torch.is_tensor(inp): + inp = inp.to(get_accelerator().device_name()) + _inputs.append(inp) + + return tuple(_inputs) + + +def _match_outputs(ref, tgt): + assert type(ref) == type(tgt) + if type(ref) in [list, tuple]: + for x, y in zip(ref, tgt): + _match_outputs(x, y) + elif not torch.is_tensor(ref): + assert ref == tgt + elif ref.is_floating_point(): + assert torch.allclose(ref, tgt) + else: + assert torch.equal(ref, tgt) + + +def _test_activation_checkpoint(module, *inputs): + # Move to device + 
module.to(get_accelerator().device_name()) + + # Get rid of dropouts until we fork the RNG between tests. + module.eval() + + module_ = deepcopy(module) + inputs_ = _prep_inputs(*inputs) + base = _compute(module_, *inputs_, do_checkpoint=False) + + module_ = deepcopy(module) + inputs_ = _prep_inputs(*inputs) + test = _compute(module_, *inputs_, do_checkpoint=True) + + for group in base.keys(): + for b, t in zip(base[group], test[group]): + _match_outputs(b, t) + + +def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs): + # Move to device + module.to(get_accelerator().device_name()) + + # Get rid of dropouts until we fork the RNG between tests. + module.eval() + + module_ = deepcopy(module) + inputs_ = _prep_inputs(*inputs) + test = _compute(module_, *inputs_, do_checkpoint=True) + + outputs = test['outputs'] + test_ordering = [] + for item in outputs: + if type(item) in [list, tuple]: + test_ordering += [torch.is_tensor(t) for t in item] + else: + test_ordering += [torch.is_tensor(item)] + + assert expected_ordering == test_ordering + + +# +# Helpers +# + + +class MaskedLinear(torch.nn.Linear): + def forward(self, x, mask): + out = super().forward(x) + if mask.is_floating_point(): + out = out * mask + else: + # must cast BoolTensor in older torch versions + out = out * mask.type_as(out) + return out + + +class MaskedLinearSeq(MaskedLinear): + """Tests pipeline modules by also returning the mask.""" + def forward(self, x, mask): + return super().forward(x, mask), mask + + +class MaskedLinearSeqDup(MaskedLinearSeq): + """MaskedLinearSeq, but with more outputs than inputs and in a different order.""" + def forward(self, x, mask): + dup = x.clone().detach() * 1.38 # just an arbitrary scaling + x, mask = super().forward(x, mask) + return dup, x, mask + + +class DropMaskLinear(torch.nn.Linear): + def forward(self, x, mask): + return super().forward(x) + + +class LinearNonTensorInput(torch.nn.Linear): + def forward(self, x, non_tensor_input): + 
return super().forward(x) + + +class LinearNonTensorOutput(torch.nn.Linear): + def __init__(self, non_tensor_output): + super().__init__(HIDDEN_DIM, HIDDEN_DIM) + self.non_tensor_output = non_tensor_output + + def forward(self, x): + out = super().forward(x) + return out, self.non_tensor_output + + +HIDDEN_DIM = 20 + + +def _mixed_mask(size=HIDDEN_DIM): + entries = torch.randn(size) + mask = torch.where(entries > 0, torch.ones(size), torch.zeros(size)) + mask = mask.bool() + return mask + + +def _bool_to_float(btensor, dtype=torch.float32): + """Converts a torch.BoolTensor to an equivalent dtype. """ + ones = torch.ones(size=btensor.size(), dtype=dtype) + zeros = torch.zeros(size=btensor.size(), dtype=dtype) + return torch.where(btensor, ones, zeros) + + +# +# Tests +# + + +# both bool and float are important, as bool is not differentiable +@pytest.mark.parametrize('mask', + [ + _mixed_mask(), + _bool_to_float(_mixed_mask()), + ]) +class TestActivationCheckpoint(DistributedTest): + world_size = 1 + + def test_ckpt_inputs1_outputs1(self, mask): + module = torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs) + + def test_ckpt_inputs2_outputs1(self, mask): + module = MaskedLinear(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, mask) + + def test_ckpt_inputs2_outputs2(self, mask): + module = MaskedLinearSeq(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, mask) + + def test_ckpt_inputs2_outputs3(self, mask): + module = MaskedLinearSeqDup(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, mask) + + def test_ckpt_arg_none(self, mask): + module = DropMaskLinear(HIDDEN_DIM, HIDDEN_DIM) + inputs = (torch.rand(HIDDEN_DIM), None) + 
inputs[0].requires_grad = True + _test_activation_checkpoint(module, *inputs) + + +@pytest.mark.parametrize( + 'non_tensor', + [None, + 2, + True, + (None, + 2.5), + (None, + True, + torch.randn(HIDDEN_DIM))]) +class TestCheckpointNonTensor(DistributedTest): + world_size = 1 + + def test_ckpt_non_tensor_input(self, non_tensor): + module = LinearNonTensorInput(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, non_tensor) + + def test_ckpt_non_tensor_output(self, non_tensor): + module = LinearNonTensorOutput(non_tensor) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs) + + +@pytest.mark.parametrize('non_tensor_output', + [ + None, + (torch.randn(HIDDEN_DIM), + 2.5), + (None, + torch.randn(HIDDEN_DIM), + True), + (None, + True, + torch.randn(HIDDEN_DIM)) + ]) +class TestCheckpointNonTensorOutputOrdering(DistributedTest): + world_size = 1 + + def test_ckpt_non_tensor_output_ordering(self, non_tensor_output): + module = LinearNonTensorOutput(non_tensor_output) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + + # First return is a tensor + ordering = [True] + if type(non_tensor_output) in [list, tuple]: + ordering += [torch.is_tensor(t) for t in non_tensor_output] + else: + ordering += [torch.is_tensor(non_tensor_output)] + _test_activation_checkpoint_ordering(module, ordering, inputs) diff --git a/tests/unit/runtime/comm/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py new file mode 100644 index 0000000000000000000000000000000000000000..fa1041379a6b8f1d70c0b5b3be52f0c0a0fa88b2 --- /dev/null +++ b/tests/unit/runtime/comm/test_coalesced_collectives.py @@ -0,0 +1,70 @@ +'''Copyright The Microsoft DeepSpeed Team''' +"""unit tests for coalesced collectives""" + +import torch +import deepspeed.comm as dist +from deepspeed.runtime.comm.coalesced_collectives import 
reduce_scatter_coalesced +from deepspeed.accelerator import get_accelerator + +from unit.common import DistributedTest + + +class TestReduceScatterCoalesced(DistributedTest): + world_size = 2 + + def test_single_input(self): + input = torch.full((6, + ), + dist.get_rank(), + dtype=torch.half, + device=get_accelerator().current_device_name()) + + (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) + + assert output.shape == (3, ) + assert torch.allclose(output, torch.full_like(output, 0.5)) + + def test_two_inputs(self): + tensor_kwargs = { + "device": get_accelerator().current_device_name(), + "dtype": torch.half + } + inputs = [ + dist.get_rank() * torch.arange(0, + 6, + **tensor_kwargs), + dist.get_rank() * torch.arange(6, + 9, + **tensor_kwargs), + ] + + output1, output2 = reduce_scatter_coalesced(inputs, dist.get_world_group()) + + if dist.get_rank() == 0: + assert output1.shape == (3, ) + assert torch.allclose(output1, torch.arange(0, 3, **tensor_kwargs) / 2) + assert output2.shape == (2, ) + assert torch.allclose(output2, torch.arange(6, 8, **tensor_kwargs) / 2) + elif dist.get_rank() == 1: + assert output1.shape == (3, ) + assert torch.allclose(output1, torch.arange(3, 6, **tensor_kwargs) / 2) + assert output2.shape == (1, ) + assert torch.allclose(output2, torch.arange(8, 9, **tensor_kwargs) / 2) + + +class TestReduceScatterCoalescedTensorSmallerThanWorldSize(DistributedTest): + world_size = 2 + + def test(self): + input = torch.zeros((1, + ), + dtype=torch.half, + device=get_accelerator().current_device_name()) + + (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) + + if dist.get_rank() == 0: + assert output.shape == (1, ) + assert torch.allclose(output, torch.zeros_like(output)) + elif dist.get_rank() == 1: + assert output.shape == (0, ) diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py new file mode 100644 index 
0000000000000000000000000000000000000000..84a36768174ae9dc7825098e91014da01528cd4b --- /dev/null +++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -0,0 +1,1308 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import torch.nn as nn +import deepspeed.comm as dist +import deepspeed +import pytest +import copy +import os +import numpy as np + +from deepspeed.runtime.pipe.topology import PipeDataParallelTopology +from deepspeed.ops.op_builder import OpBuilder +from deepspeed.runtime.pipe.module import PipelineModule +from unit.common import DistributedTest +from unit.simple_model import SimpleModel, random_dataloader +from unit.alexnet_model import AlexNetPipe, train_cifar +from deepspeed.accelerator import get_accelerator + +PipeTopo = PipeDataParallelTopology + +TORCH_MAJOR = int(torch.__version__.split(".")[0]) +TORCH_MINOR = int(torch.__version__.split(".")[1]) +if TORCH_MAJOR < 1 or TORCH_MINOR < 8: + pytest.skip( + "NCCL-based 1-bit compression requires torch 1.8 or higher", + allow_module_level=True, + ) + +rocm_version = OpBuilder.installed_rocm_version() +if rocm_version[0] > 4: + pytest.skip( + "NCCL-based 1-bit compression is not yet supported w. 
ROCm 5 until cupy supports ROCm 5", + allow_module_level=True) + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) +class TestOneBitAdamBasic(DistributedTest): + world_size = 2 + + def test(self, dtype): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": (dtype == torch.float16), + "loss_scale": 0, + "initial_scale_power": 16, + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize( + config=config_dict, model=model, model_parameters=model.parameters() + ) + data_loader = random_dataloader( + model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype, + ) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestOneBitAdamExpAvgMask(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask1 = torch.flatten(mask1) + optimizer_grouped_parameters = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01, + "exp_avg_mask": mask1, + }, + { + "params": 
[param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + model, optimizer, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters, + ) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + # Test whether the momentum mask works + for v in optimizer.state.values(): + if v["exp_avg"].size() == mask1.size(): + assert torch.allclose( + v["exp_avg"], + v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)), + atol=1e-07, + ), f"Momentum mask is not working properly" + + +class TestOneBitAdamCheckpointing(DistributedTest): + world_size = 2 + + def test(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + mask2 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask2[1][col] += 1 + mask1 = torch.flatten(mask1) + mask2 = torch.flatten(mask2) + + optimizer_grouped_parameters_1 = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01, + "exp_avg_mask": mask1, + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + optimizer_grouped_parameters_2 = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01, + "exp_avg_mask": mask2, + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + 
optimizer_grouped_parameters_3 = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01 + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + model_1, optimizer_1, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters_1, + ) + data_loader = random_dataloader( + model=model_1, + total_samples=10, + hidden_dim=hidden_dim, + device=model_1.device, + ) + for n, batch in enumerate(data_loader): + loss = model_1(batch[0], batch[1]) + model_1.backward(loss) + model_1.step() + # Test whether momentum mask still exist after saving checkpoint + assert optimizer_1.optimizer.adam_freeze_key is True + mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) + assert torch.allclose( + optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 + ), f"Incorrect momentum mask" + save_folder = os.path.join(tmpdir, "saved_checkpoint") + model_1.save_checkpoint(save_folder, tag=None) + assert torch.allclose( + optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 + ), f"Momentum mask should not change after saving checkpoint" + + model_2, optimizer_2, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters_2, + ) + # Test whether momentum mask stays the same after loading checkpoint + mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) + assert torch.allclose( + optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 + ), f"Incorrect momentum mask" + model_2.load_checkpoint( + save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True, + ) + assert torch.allclose( + optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 + ), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is reset + for v in optimizer_2.state.values(): + assert "worker_error" not in v, f"Incorrect worker error" + 
assert "server_error" not in v, f"Incorrect server error" + assert optimizer_2.optimizer.adam_freeze_key is True + + model_3, optimizer_3, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters_3, + ) + optimizer_3.optimizer.freeze_step = 20 + data_loader = random_dataloader( + model=model_3, + total_samples=50, + hidden_dim=hidden_dim, + device=model_3.device, + ) + for n, batch in enumerate(data_loader): + loss = model_3(batch[0], batch[1]) + model_3.backward(loss) + model_3.step() + assert optimizer_3.optimizer.adam_freeze_key is True + # Test whether momentum mask stays the same after loading checkpoint + assert ( + "exp_avg_mask" not in optimizer_3.param_groups[0] + ), f"Incorrect momentum mask" + model_3.load_checkpoint( + save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True, + ) + assert ( + "exp_avg_mask" not in optimizer_3.param_groups[0] + ), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is reset + for v in optimizer_3.state.values(): + assert "worker_error" not in v, f"Incorrect worker error" + assert "server_error" not in v, f"Incorrect server error" + assert optimizer_3.optimizer.adam_freeze_key is False + + def test_overflow(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize( + config=config_dict, model=model, model_parameters=model.parameters() + ) + data_loader = random_dataloader(model=model, + total_samples=100, + hidden_dim=hidden_dim, + 
device=model.device) + save_folder = os.path.join(tmpdir, "saved_checkpoint") + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if dist.get_rank() == 0 and n >= 10: + loss = loss * 1000000.0 + model.backward(loss) + dist.barrier() + model.step() + dist.barrier() + model.save_checkpoint(save_folder, tag=None) + + +@pytest.mark.parametrize( + "topo_config", + [ + { + "num_pp": 1, + "num_dp": 4 + }, + { + "num_pp": 2, + "num_dp": 2 + }, + { + "num_pp": 4, + "num_dp": 1 + }, + ], +) +class TestOneBitAdamFP16Pipeline(DistributedTest): + world_size = 4 + + def test(self, topo_config): + config_dict = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 20, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7, + "freeze_step": 200, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "pipeline": { + "seed_layers": True, + "activation_checkpoint_interval": 1 + }, + } + + topo = PipeTopo(**topo_config) + steps = 500 # Must be >=100 + + # Allocate model for consistent initial weights. 
+ init_net = AlexNetPipe() + + test_net = copy.deepcopy(init_net) + test_model = PipelineModule(layers=test_net.to_layers(), + topology=topo, + loss_fn=nn.CrossEntropyLoss()) + + test_losses = train_cifar( + test_model, + config=config_dict, + num_steps=steps, + fp16=config_dict["fp16"]["enabled"], + ) + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) +class TestZeroOneAdamBasic(DistributedTest): + world_size = 2 + + def test(self, dtype): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "ZeroOneAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "var_freeze_step": 4, + "var_update_scaler": 1, + "local_step_scaler": 1, + "local_step_clipper": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": (dtype == torch.float16), + "loss_scale": 0, + "initial_scale_power": 16, + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize( + config=config_dict, model=model, model_parameters=model.parameters() + ) + data_loader = random_dataloader( + model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype, + ) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestZeroOneAdamExpAvgMask(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "ZeroOneAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "var_freeze_step": 4, + "var_update_scaler": 1, + "local_step_scaler": 1, + "local_step_clipper": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + } + 
hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask1 = torch.flatten(mask1) + optimizer_grouped_parameters = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01, + "exp_avg_mask": mask1, + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + model, optimizer, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters, + ) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + # Test whether the momentum mask works + for v in optimizer.state.values(): + if v["exp_avg"].size() == mask1.size(): + assert torch.allclose( + v["exp_avg"], + v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)), + atol=1e-07, + ), f"Momentum mask is not working properly" + + +class TestZeroOneAdamCheckpointing(DistributedTest): + world_size = 2 + + def test(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "ZeroOneAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "var_freeze_step": 4, + "var_update_scaler": 1, + "local_step_scaler": 1, + "local_step_clipper": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + mask2 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask2[1][col] += 1 + mask1 = 
torch.flatten(mask1) + mask2 = torch.flatten(mask2) + + optimizer_grouped_parameters_1 = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01, + "exp_avg_mask": mask1, + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + optimizer_grouped_parameters_2 = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01, + "exp_avg_mask": mask2, + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + optimizer_grouped_parameters_3 = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01 + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + model_1, optimizer_1, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters_1, + ) + data_loader = random_dataloader( + model=model_1, + total_samples=10, + hidden_dim=hidden_dim, + device=model_1.device, + ) + for n, batch in enumerate(data_loader): + loss = model_1(batch[0], batch[1]) + model_1.backward(loss) + model_1.step() + # Test whether momentum mask still exist after saving checkpoint + mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) + assert torch.allclose( + optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 + ), f"Incorrect momentum mask" + save_folder = os.path.join(tmpdir, "saved_checkpoint") + model_1.save_checkpoint(save_folder, tag=None) + assert torch.allclose( + optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 + ), f"Momentum mask should not change after saving checkpoint" + + model_2, optimizer_2, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters_2, + ) + # Test whether momentum mask stays the same after loading checkpoint + mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) + assert torch.allclose( + optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 + ), f"Incorrect momentum mask" + 
model_2.load_checkpoint( + save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True, + ) + assert torch.allclose( + optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 + ), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is reset + for v in optimizer_2.state.values(): + assert "worker_error" not in v, f"Incorrect worker error" + assert "server_error" not in v, f"Incorrect server error" + + model_3, optimizer_3, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters_3, + ) + optimizer_3.optimizer.freeze_step = 20 + data_loader = random_dataloader( + model=model_3, + total_samples=50, + hidden_dim=hidden_dim, + device=model_3.device, + ) + for n, batch in enumerate(data_loader): + loss = model_3(batch[0], batch[1]) + model_3.backward(loss) + model_3.step() + # Test whether momentum mask stays the same after loading checkpoint + assert ( + "exp_avg_mask" not in optimizer_3.param_groups[0] + ), f"Incorrect momentum mask" + model_3.load_checkpoint( + save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True, + ) + assert ( + "exp_avg_mask" not in optimizer_3.param_groups[0] + ), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is reset + for v in optimizer_3.state.values(): + assert "worker_error" not in v, f"Incorrect worker error" + assert "server_error" not in v, f"Incorrect server error" + + def test_overflow(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "ZeroOneAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "var_freeze_step": 4, + "var_update_scaler": 1, + "local_step_scaler": 1, + "local_step_clipper": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": 
True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize( + config=config_dict, model=model, model_parameters=model.parameters() + ) + data_loader = random_dataloader(model=model, + total_samples=100, + hidden_dim=hidden_dim, + device=model.device) + save_folder = os.path.join(tmpdir, "saved_checkpoint") + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if dist.get_rank() == 0 and n >= 10: + loss = loss * 1000000.0 + model.backward(loss) + dist.barrier() + model.step() + dist.barrier() + model.save_checkpoint(save_folder, tag=None) + + +@pytest.mark.parametrize( + "topo_config", + [ + { + "num_pp": 1, + "num_dp": 4 + }, + { + "num_pp": 2, + "num_dp": 2 + }, + { + "num_pp": 4, + "num_dp": 1 + }, + ], +) +class TestZeroOneAdamFP16Pipeline(DistributedTest): + world_size = 4 + + def test(self, topo_config): + config_dict = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 20, + "optimizer": { + "type": "ZeroOneAdam", + "params": { + "lr": 0.00001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7, + "var_freeze_step": 4, + "var_update_scaler": 1, + "local_step_scaler": 1, + "local_step_clipper": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "pipeline": { + "seed_layers": True, + "activation_checkpoint_interval": 1 + }, + } + + topo = PipeTopo(**topo_config) + steps = 500 # Must be >=100 + + # Allocate model for consistent initial weights. 
+ init_net = AlexNetPipe() + + test_net = copy.deepcopy(init_net) + test_model = PipelineModule(layers=test_net.to_layers(), + topology=topo, + loss_fn=nn.CrossEntropyLoss()) + + test_losses = train_cifar( + test_model, + config=config_dict, + num_steps=steps, + fp16=config_dict["fp16"]["enabled"], + ) + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) +class TestOneBitLambBasic(DistributedTest): + world_size = 2 + + def test(self, dtype): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1, + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": (dtype == torch.float16), + "loss_scale": 0, + "initial_scale_power": 16, + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize( + config=config_dict, model=model, model_parameters=model.parameters() + ) + data_loader = random_dataloader( + model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype, + ) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestOneBitLampExpAvgMask(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1, + }, + }, + "gradient_clipping": 1.0, 
+ "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + optimizer_grouped_parameters = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01, + "exp_avg_mask": mask1, + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + model, optimizer, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters, + ) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + # Test whether the momentum mask works + for v in optimizer.state.values(): + if v["exp_avg"].size() == mask1.size(): + assert torch.allclose( + v["exp_avg"], + v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)), + atol=1e-07, + ), f"Momentum mask is not working properly" + + +class TestOneBitLambCheckpointing(DistributedTest): + world_size = 2 + + def test(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1, + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + mask2 = 
torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask2[1][col] += 1 + + optimizer_grouped_parameters_1 = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01, + "exp_avg_mask": mask1, + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + optimizer_grouped_parameters_2 = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01, + "exp_avg_mask": mask2, + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + optimizer_grouped_parameters_3 = [ + { + "params": [param_optimizer[0][1]], + "weight_decay": 0.01 + }, + { + "params": [param_optimizer[1][1]], + "weight_decay": 0.01 + }, + ] + + model_1, optimizer_1, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters_1, + ) + data_loader = random_dataloader( + model=model_1, + total_samples=10, + hidden_dim=hidden_dim, + device=model_1.device, + ) + for n, batch in enumerate(data_loader): + loss = model_1(batch[0], batch[1]) + model_1.backward(loss) + model_1.step() + # Test whether momentum mask still exist after saving checkpoint + assert optimizer_1.optimizer.lamb_freeze_key is True + mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) + assert torch.allclose( + optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 + ), f"Incorrect momentum mask" + scaling_coeff_1 = [] + for v in optimizer_1.state.values(): + assert "scaling_coeff" in v, f"Incorrect scaling_coeff" + scaling_coeff_1.append(v["scaling_coeff"]) + save_folder = os.path.join(tmpdir, "saved_checkpoint") + model_1.save_checkpoint(save_folder, tag=None) + assert torch.allclose( + optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 + ), f"Momentum mask should not change after saving checkpoint" + + model_2, optimizer_2, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + 
model_parameters=optimizer_grouped_parameters_2, + ) + # Test whether momentum mask stays the same after loading checkpoint + mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) + assert torch.allclose( + optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 + ), f"Incorrect momentum mask" + model_2.load_checkpoint( + save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True, + ) + assert torch.allclose( + optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 + ), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is reset + assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error" + assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error" + # Test whether scaling_coeffs is loaded correctly + scaling_coeff_2 = [] + for v in optimizer_2.state.values(): + assert "scaling_coeff" in v, f"Incorrect scaling_coeff" + scaling_coeff_2.append(v["scaling_coeff"]) + assert list(sorted(scaling_coeff_2)) == list( + sorted(scaling_coeff_1) + ), f"Incorrect scaling_coeffs" + assert optimizer_2.optimizer.lamb_freeze_key is True + + model_3, optimizer_3, _, _ = deepspeed.initialize( + config=config_dict, + model=model, + model_parameters=optimizer_grouped_parameters_3, + ) + optimizer_3.optimizer.freeze_step = 20 + data_loader = random_dataloader( + model=model_3, + total_samples=50, + hidden_dim=hidden_dim, + device=model_3.device, + ) + for n, batch in enumerate(data_loader): + loss = model_3(batch[0], batch[1]) + model_3.backward(loss) + model_3.step() + assert optimizer_3.optimizer.lamb_freeze_key is True + # Test whether momentum mask stays the same after loading checkpoint + assert ( + "exp_avg_mask" not in optimizer_3.param_groups[0] + ), f"Incorrect momentum mask" + model_3.load_checkpoint( + save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True, + ) + assert ( + "exp_avg_mask" not 
in optimizer_3.param_groups[0] + ), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is reset + assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error" + assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error" + # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are reset + for v in optimizer_3.state.values(): + assert v["lamb_coeff_freeze"] == 0.0, f"Incorrect lamb_coeff_freeze" + assert v["last_factor"] == 1.0, f"Incorrect last_factor" + assert "scaling_coeff" not in v, f"Incorrect scaling_coeff" + assert optimizer_3.optimizer.lamb_freeze_key is False + + def test_overflow(self, tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1, + }, + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize( + config=config_dict, model=model, model_parameters=model.parameters() + ) + data_loader = random_dataloader(model=model, + total_samples=100, + hidden_dim=hidden_dim, + device=model.device) + save_folder = os.path.join(tmpdir, "saved_checkpoint") + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if dist.get_rank() == 0 and n >= 10: + loss = loss * 1000000.0 + model.backward(loss) + dist.barrier() + model.step() + dist.barrier() + model.save_checkpoint(save_folder, tag=None) + + +@pytest.mark.parametrize( + "topo_config", + [ + { + "num_pp": 1, + "num_dp": 4 + }, + { + "num_pp": 2, + "num_dp": 2 + }, + { + "num_pp": 4, + "num_dp": 
1 + }, + ], +) +class TestOneBitLambFP16Pipeline(DistributedTest): + world_size = 4 + + def test(self, topo_config): + config_dict = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 20, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7, + "freeze_step": 200, + "cuda_aware": False, + "comm_backend_name": get_accelerator().communication_backend_name(), + }, + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "pipeline": { + "seed_layers": True, + "activation_checkpoint_interval": 1 + }, + } + + topo = PipeTopo(**topo_config) + steps = 500 # Must be >=100 + + # Allocate model for consistent initial weights. + init_net = AlexNetPipe() + + test_net = copy.deepcopy(init_net) + test_model = PipelineModule(layers=test_net.to_layers(), + topology=topo, + loss_fn=nn.CrossEntropyLoss()) + + test_losses = train_cifar( + test_model, + config=config_dict, + num_steps=steps, + fp16=config_dict["fp16"]["enabled"], + ) + + +@pytest.mark.sequential +class TestCompressedAllReduceBasic(DistributedTest): + world_size = 2 + + def test(self, tmpdir): + from deepspeed.runtime.comm.nccl import NcclBackend + + size = dist.get_world_size() + rank = dist.get_rank() + backend = NcclBackend() + local_rank = dist.get_rank() + device = torch.device(get_accelerator().device_name(), dist.get_rank()) + + # A simulated compression function using deepspeed.comm + def torch_sim(a): + a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + scale = a.norm() / np.sqrt(a.numel()) + a_compressed = scale * a_sign + a_sign = None + worker_error = a - a_compressed + dist.all_reduce(a_compressed) + a_compressed.mul_(1 / dist.get_world_size()) + a_server_sign = ( + a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + a_list = torch.chunk(a_compressed, 
chunks=dist.get_world_size()) + server_scale = [ + chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list + ] + a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) + a_server_compressed = torch.cat( + [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + rank = dist.get_rank() + server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] + get_accelerator().synchronize() + dist.barrier() + return a_server_compressed, worker_error, server_error + + tensor_size = 300 * 2**20 + server_size = int(tensor_size / size) + if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) + else: + right_tensor_size = tensor_size + right_server_size = right_tensor_size // size + + # Adding bias to the initialization of the gradient we are communicating + # In order to get rid of the case where some elements in the gradient are too small + a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + + worker_error = torch.zeros(right_tensor_size, device=device) + server_error = torch.zeros(right_server_size, device=device) + + a_torch, worker_error_torch, server_error_torch = torch_sim(a) + get_accelerator().empty_cache() + + a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) + + threshold = 1e-6 + magnitude_threshold = 1e-6 + diff_mask = (a_after - a_torch) > threshold + diff_server_mask = torch.chunk(diff_mask, size)[rank] + mpi_server = torch.chunk(a_after, size)[rank] + server_error + torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch + + # If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic + # The test would skip those numbers that are too small in compensated_server_m + check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold + if torch.sum(check_mag_mask) != 0: + print("Fails at {} of positions".format(torch.sum(check_mag_mask))) + assert 
torch.sum(diff_server_mask) == 0 or torch.sum(check_mag_mask) == 0 diff --git a/tests/unit/runtime/half_precision/test_bf16.py b/tests/unit/runtime/half_precision/test_bf16.py new file mode 100644 index 0000000000000000000000000000000000000000..3bc5cb138c9b9ab2d8a9df132349b880c06738d5 --- /dev/null +++ b/tests/unit/runtime/half_precision/test_bf16.py @@ -0,0 +1,357 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed +import pytest +from deepspeed.ops.adam import FusedAdam +from unit.common import DistributedTest +from deepspeed.ops.op_builder import CPUAdamBuilder +from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader +from unit.util import bf16_required_version_check +from deepspeed import comm as dist + + +class TestAdamBF16ZeroOneCycleCompatibility(DistributedTest): + world_size = 1 + + def test(self, zero_stage=2, use_cpu_offload=False): + if not bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "scheduler": { + "type": "OneCycle", + "params": { + "cycle_first_step_size": 16000, + "cycle_first_stair_count": 8000, + "decay_step_size": 16000, + "cycle_min_lr": 1e-06, + "cycle_max_lr": 3e-05, + "decay_lr_rate": 1e-07, + "cycle_min_mom": 0.85, + "cycle_max_mom": 0.99, + "decay_mom_rate": 0.0 + } + }, + "fp16": { + "enabled": False + }, + "bf16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload + } + } + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = 
random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.bfloat16) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestZeroAllowUntestedOptimizer(DistributedTest): + world_size = 1 + + def test(self, zero_stage=2, use_cpu_offload=False): + if not bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 4, + "steps_per_print": 1, + "fp16": { + "enabled": False, + }, + "bf16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload + }, + "zero_allow_untested_optimizer": False + } + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + optimizer = SimpleOptimizer(model.parameters()) + with pytest.raises(AssertionError): + model, optim, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer, + model_parameters=model.parameters()) + + +class TestZeroEmptyPartition(DistributedTest): + world_size = 3 + + def test(self, zero_stage=2, use_cpu_offload=False): + if not bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + if zero_stage == 3: + pytest.skip("skip for now") + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "fp16": { + "enabled": False + }, + "bf16": { + "enabled": True + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "zero_optimization": { + 
"stage": zero_stage, + "cpu_offload": use_cpu_offload, + "reduce_bucket_size": 100, + "allgather_bucket_size": 100 + } + } + + hidden_dim = 1 + model = SimpleModel(hidden_dim) + + # Ensure model has 2 parameters, to cause empty partition with DP=3 + assert len(list(model.parameters())) == 2 + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + # Now make sure things work.. + data_loader = random_dataloader(model=model, + total_samples=1, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.bfloat16) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize("optimizer_constructor", [torch.optim.Adam, FusedAdam]) +class TestZeroSupportedClientOptimizer(DistributedTest): + world_size = 1 + + def test(self, optimizer_constructor, zero_stage=2): + if not bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": False + }, + "bf16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + client_optimizer = optimizer_constructor(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=client_optimizer) + + +class TestZero2ReduceScatterOff(DistributedTest): + world_size = 2 + + def test(self): + if not bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 
2, + "contiguous_gradients": True, + "allgather_bucket_size": 2000000000, + "reduce_bucket_size": 200000000, + "overlap_comm": False, + "reduce_scatter": False + }, + "fp16": { + "enabled": False + }, + "bf16": { + "enabled": True + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.bfloat16) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestZeroEmptyGrad(DistributedTest): + world_size = 1 + + def test(self, stage=2): + if not bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": False + }, + "bf16": { + "enabled": True + }, + "zero_optimization": { + "stage": stage + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + optimizer = torch.optim.Adam(model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.bfloat16) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize("comp_type", + [torch.float16, + torch.bfloat16, + torch.float], + ids=["fp16", + "bfp16", + "fp32"]) +@pytest.mark.parametrize("comm_type", + [torch.float16, + torch.bfloat16], + ids=["fp16", + "bfp16"]) +class TestZeroDtypeCocktail(DistributedTest): + world_size = 2 + + def test(self, comp_type, comm_type): + if comp_type == torch.bfloat16 or comm_type == torch.bfloat16: + if not 
bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + type_str = {torch.float16: "fp16", torch.bfloat16: "bfp16"} + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": comp_type == torch.float16 + }, + "bf16": { + "enabled": comp_type == torch.bfloat16 + }, + "zero_optimization": { + "stage": 2 + }, + "communication_data_type": type_str[comm_type] + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + optimizer = torch.optim.Adam(model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer) + data_loader = random_dataloader(model=model, + total_samples=2, + hidden_dim=hidden_dim, + device=model.device, + dtype=comp_type) + + def custom_reduce(tensor, dst, op=dist.ReduceOp.SUM, group=None, async_op=False): + assert tensor.dtype == comm_type + return orig_torch_reduce(tensor, dst, op, group, async_op) + + orig_torch_reduce = dist.reduce + dist.reduce = custom_reduce + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + dist.reduce = orig_torch_reduce diff --git a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py new file mode 100644 index 0000000000000000000000000000000000000000..3052c4ee117acad6f465e4afa17b10902d454f7a --- /dev/null +++ b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py @@ -0,0 +1,279 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed +import numpy as np +from unit.common import DistributedTest +from unit.simple_model import SimpleModel + + +def run_model_step(model, gradient_list): + for value in gradient_list: + for p in model.parameters(): + p.grad = torch.empty_like(p, dtype=p.dtype) + p.grad.fill_(value) + model.step() + + +class 
TestFused(DistributedTest): + world_size = 1 + + def test_no_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 8, + "loss_scale_window": 2 + } + } + hidden_dim = 1 + model = SimpleModel(hidden_dim) + model, optim, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + expected_loss_scale = 2**8 + expected_scale_window = 2 + # Ensure the dynamic loss scaler is correctly configured. + assert optim.dynamic_loss_scale == True + assert optim.cur_scale == expected_loss_scale + assert optim.scale_window == expected_scale_window + + for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)): + run_model_step(model, [value]) + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == (i + 1) + if optim.cur_iter % expected_scale_window == 0: + expected_loss_scale *= 2 + + def test_all_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 4, + "loss_scale_window": 2 + } + } + hidden_dim = 1 + model = SimpleModel(hidden_dim) + model, optim, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + expected_loss_scale = 2**4 + # Ensure the dynamic loss scaler is correctly configured. 
+ assert optim.dynamic_loss_scale == True + assert optim.cur_scale == expected_loss_scale + + overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6 + for i, value in enumerate(overflow_gradients): + run_model_step(model, [value]) + expected_loss_scale = max(expected_loss_scale / 2, 1) + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == (i + 1) + + def test_some_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 8, + "loss_scale_window": 2 + } + } + hidden_dim = 1 + model = SimpleModel(hidden_dim) + model, optim, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + expected_loss_scale = 2**8 + expected_scale_window = 2 + expected_iteration = 0 + # Ensure the dynamic loss scaler is correctly configured. + assert optim.dynamic_loss_scale == True + assert optim.cur_scale == expected_loss_scale + assert optim.scale_window == expected_scale_window + + # Run model with overflows to decrease scale + overflow_gradients = [float('inf'), float('nan')] + expected_iteration += len(overflow_gradients) + run_model_step(model, overflow_gradients) + expected_loss_scale /= (2**len(overflow_gradients)) + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == expected_iteration + + # Run model scale_window + 1 times to increase scale once + normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1) + expected_iteration += len(normal_gradients) + run_model_step(model, normal_gradients) + expected_loss_scale *= 2 + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == expected_iteration + + # Run model with overflows to decrease scale + overflow_gradients = [float('inf')] + expected_iteration += len(overflow_gradients) + run_model_step(model, overflow_gradients) + 
expected_loss_scale /= (2**len(overflow_gradients)) + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == expected_iteration + + +class TestUnfused(DistributedTest): + world_size = 1 + + def test_no_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 8, + "loss_scale_window": 2 + } + } + hidden_dim = 1 + model = SimpleModel(hidden_dim) + model, optim, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + expected_loss_scale = 2**8 + expected_scale_window = 2 + # Ensure the dynamic loss scaler is correctly configured. + assert optim.dynamic_loss_scale == True + assert optim.cur_scale == expected_loss_scale + assert optim.scale_window == expected_scale_window + + for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)): + run_model_step(model, [value]) + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == (i + 1) + if optim.cur_iter % expected_scale_window == 0: + expected_loss_scale *= 2 + + def test_all_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 4, + "loss_scale_window": 2, + "min_loss_scale": 0.25 + } + } + hidden_dim = 1 + model = SimpleModel(hidden_dim) + model, optim, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + expected_loss_scale = 2**4 + expected_min_loss_scale = 0.25 + # Ensure the dynamic loss scaler is correctly configured. 
+ assert optim.dynamic_loss_scale == True + assert optim.cur_scale == expected_loss_scale + assert optim.min_loss_scale == expected_min_loss_scale + + overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6 + for i, value in enumerate(overflow_gradients): + run_model_step(model, [value]) + expected_loss_scale = max(expected_loss_scale / 2, expected_min_loss_scale) + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == (i + 1) + + def test_some_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 8, + "loss_scale_window": 2 + } + } + hidden_dim = 1 + model = SimpleModel(hidden_dim) + model, optim, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + expected_loss_scale = 2**8 + expected_scale_window = 2 + expected_iteration = 0 + # Ensure the dynamic loss scaler is correctly configured. 
+ assert optim.dynamic_loss_scale == True + assert optim.cur_scale == expected_loss_scale + assert optim.scale_window == expected_scale_window + + # Run model with overflows to decrease scale + overflow_gradients = [float('inf'), float('nan')] + expected_iteration += len(overflow_gradients) + run_model_step(model, overflow_gradients) + expected_loss_scale /= (2**len(overflow_gradients)) + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == expected_iteration + + # Run model scale_window + 1 times to increase scale once + normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1) + expected_iteration += len(normal_gradients) + run_model_step(model, normal_gradients) + expected_loss_scale *= 2 + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == expected_iteration + + # Run model with overflows to decrease scale + overflow_gradients = [float('inf')] + expected_iteration += len(overflow_gradients) + run_model_step(model, overflow_gradients) + expected_loss_scale /= (2**len(overflow_gradients)) + assert optim.cur_scale == expected_loss_scale + assert optim.cur_iter == expected_iteration diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..c3c933fca144ced19bd0d9bff6dc032b4a22183d --- /dev/null +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -0,0 +1,829 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed.comm as dist +import deepspeed +import pytest +from deepspeed.ops.adam import FusedAdam +from unit.common import DistributedTest +from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader +from unit.util import required_torch_version +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import CPUAdamBuilder + +try: + from apex import amp # noqa: F401 + _amp_available 
= True +except ImportError: + _amp_available = False +amp_available = pytest.mark.skipif(not _amp_available, + reason="apex/amp is not installed") + + +class TestLambFP32GradClip(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestLambFP16(DistributedTest): + world_size = 2 + + def test__basic(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + def test_empty_grad(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=True) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + 
total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestAdamFP32EmptyGrad(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": False + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=True) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestAdamwFP16Basic(DistributedTest): + world_size = 1 + + def test(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + optimizer = torch.optim.AdamW(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestFP16OptimizerForMoE(DistributedTest): + world_size = 2 + + def test_unfused_gradnorm(self, monkeypatch): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 10 + + def mock_unscale_and_clip_grads(total_norm, apply_scale=True): + torch_norm_tensor = 
get_accelerator().FloatTensor([total_norm]) + all_gather_results = [ + torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) + ] + dist.all_gather(all_gather_results, torch_norm_tensor) + assert len(set([x.item() for x in all_gather_results])) == 1 + return 1.0 + + # initialize MoE + model = SimpleMoEModel(hidden_dim, ep_size=2) + optimizer = torch.optim.AdamW(params=model.parameters()) + engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer, + dist_init_required=False) + monkeypatch.setattr(optimizer, + 'unscale_and_clip_grads', + mock_unscale_and_clip_grads) + data_loader = sequence_dataloader(model=engine, + total_samples=50, + hidden_dim=hidden_dim, + device=engine.device) + for n, batch in enumerate(data_loader): + loss = engine(batch[0], batch[1]) + engine.backward(loss) + engine.step() + + def test_fused_gradnorm(self, monkeypatch): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 10 + + def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True): + torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) + all_gather_results = [ + torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) + ] + dist.all_gather(all_gather_results, torch_norm_tensor) + assert len(set([x.item() for x in all_gather_results])) == 1 + return 1.0 + + # initialize MoE + model = SimpleMoEModel(hidden_dim, ep_size=2) + # optimizer = torch.optim.AdamW(params=model.parameters()) + optimizer = FusedAdam(params=model.parameters()) + engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer, + dist_init_required=False) + monkeypatch.setattr(optimizer, + 'unscale_and_clip_grads', + mock_unscale_and_clip_grads) + data_loader = 
sequence_dataloader(model=engine, + total_samples=50, + hidden_dim=hidden_dim, + device=engine.device) + for n, batch in enumerate(data_loader): + loss = engine(batch[0], batch[1]) + engine.backward(loss) + engine.step() + + @pytest.mark.parametrize("fused_lamb_legacy", [(False), (True)]) + def test_lamb_gradnorm(self, monkeypatch, fused_lamb_legacy: bool): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + }, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + } + } + hidden_dim = 10 + + def mock_unscale_and_clip_grads(total_norm, apply_scale=True): + torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) + all_gather_results = [ + torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) + ] + dist.all_gather(all_gather_results, torch_norm_tensor) + assert len(set([x.item() for x in all_gather_results])) == 1 + return 1.0 + + # initialize MoE + model = SimpleMoEModel(hidden_dim, ep_size=2) + engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters(), + dist_init_required=False) + monkeypatch.setattr(optimizer, + 'unscale_and_clip_grads', + mock_unscale_and_clip_grads) + optimizer.fused_lamb_legacy = fused_lamb_legacy + data_loader = sequence_dataloader(model=engine, + total_samples=50, + hidden_dim=hidden_dim, + device=engine.device) + for n, batch in enumerate(data_loader): + loss = engine(batch[0], batch[1]) + engine.backward(loss) + engine.step() + + +class TestAdamwFP16EmptyGrad(DistributedTest): + world_size = 1 + + def test(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + optimizer = torch.optim.AdamW(params=model.parameters()) + model, _, _, _ = 
deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("use_cpu_offload", [True, False]) +class TestAdamFP16ZeroOneCycleCompatibility(DistributedTest): + world_size = 1 + + def test(self, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "scheduler": { + "type": "OneCycle", + "params": { + "cycle_first_step_size": 16000, + "cycle_first_stair_count": 8000, + "decay_step_size": 16000, + "cycle_min_lr": 1e-06, + "cycle_max_lr": 3e-05, + "decay_lr_rate": 1e-07, + "cycle_min_mom": 0.85, + "cycle_max_mom": 0.99, + "decay_mom_rate": 0.0 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _,_ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("use_cpu_offload", [True, False]) +@pytest.mark.parametrize("hidden_dim", [9, 10]) +class TestZeroStaticScale(DistributedTest): + world_size = 1 + + def test(self, zero_stage, use_cpu_offload, hidden_dim): + if use_cpu_offload and not 
deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 4, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 138. + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload + } + } + + model = SimpleModel(hidden_dim) + model, optim, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + # Ensure the static scaler is configured. + assert optim.dynamic_loss_scale == False + assert optim.loss_scaler.loss_scale == 138. + + # Now make sure things work.. + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("use_cpu_offload", [True, False]) +class TestZeroAllowUntestedOptimizer(DistributedTest): + world_size = 1 + + def test(self, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 4, + "steps_per_print": 1, + "fp16": { + "enabled": True, + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload + }, + "zero_allow_untested_optimizer": False + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + optimizer = SimpleOptimizer(model.parameters()) + with pytest.raises(AssertionError): + model, optim, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer, + model_parameters=model.parameters()) + + +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("use_cpu_offload", [True, False]) +class TestZeroEmptyPartition(DistributedTest): + 
world_size = 3 + + def test(self, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + if zero_stage == 3: + pytest.skip("skip for now") + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload, + "reduce_bucket_size": 100, + "allgather_bucket_size": 100 + } + } + hidden_dim = 1 + model = SimpleModel(hidden_dim) + + # Ensure model has 2 parameters, to cause empty partition with DP=3 + assert len(list(model.parameters())) == 2 + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + # Now make sure things work.. + data_loader = random_dataloader(model=model, + total_samples=1, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@amp_available +class TestAmp(DistributedTest): + world_size = 2 + + def test_adam_basic(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "amp": { + "enabled": True + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + optimizer = torch.optim.Adam(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + def test_lamb_basic(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + 
"gradient_clipping": 1.0, + "amp": { + "enabled": True, + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + def test_adam_O2(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "amp": { + "enabled": True, + "opt_level": "O2" + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + def test_adam_O2_empty_grad(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "amp": { + "enabled": True, + "opt_level": "O2" + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("optimizer_constructor", [FusedAdam, torch.optim.Adam]) +class TestZeroSupportedClientOptimizer(DistributedTest): + world_size = 1 + + def 
test(self, zero_stage, optimizer_constructor): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + client_optimizer = optimizer_constructor(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=client_optimizer) + + +class TestZero2ReduceScatterOff(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 2, + "contiguous_gradients": True, + "allgather_bucket_size": 2000000000, + "reduce_bucket_size": 200000000, + "overlap_comm": False, + "reduce_scatter": False + }, + "fp16": { + "enabled": True + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize("adam_type", ["Adam", "AdamW"]) +@pytest.mark.parametrize("torch_impl", [True, False]) +class TestFP16AdamTypes(DistributedTest): + world_size = 1 + + def test(self, adam_type, torch_impl): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True, + "initial_scale_power": 10 + }, + "optimizer": { + "type": adam_type, + "torch_adam": torch_impl, + "params": { + "lr": 0.00015 + } + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + data_loader = 
random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device) + + for _, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestZero3LazyScatter(DistributedTest): + world_size = 1 + + def test(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True, + "initial_scale_power": 10 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 0.00015 + } + }, + "zero_optimization": { + "stage": 3 + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device) + + for _, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize('stage', [1, 2, 3]) +class TestZeroEmptyGrad(DistributedTest): + world_size = 1 + + def test(self, stage): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": stage + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + optimizer = torch.optim.Adam(model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + optimizer=optimizer) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() diff --git a/tests/unit/runtime/pipe/test_pipe.py b/tests/unit/runtime/pipe/test_pipe.py new file mode 100644 index 0000000000000000000000000000000000000000..2c4d3aef1f636a55c3866db1ab5a083597f3ac24 --- /dev/null +++ b/tests/unit/runtime/pipe/test_pipe.py @@ -0,0 +1,119 @@ +'''Copyright The Microsoft DeepSpeed Team''' 
+ +import copy +import torch.nn as nn +import pytest + +import deepspeed.comm as dist +from deepspeed.runtime.pipe.topology import PipeDataParallelTopology +from deepspeed.runtime.pipe.module import PipelineModule +from unit.alexnet_model import AlexNetPipe, train_cifar +from unit.common import DistributedTest + +PipeTopo = PipeDataParallelTopology + + +def rel_diff(A, B): + return abs(A - B) / abs(A) + + +@pytest.mark.parametrize('topo_config', + [ + { + "num_pp": 1, + "num_dp": 4 + }, + { + "num_pp": 2, + "num_dp": 2 + }, + { + "num_pp": 4, + "num_dp": 1 + }, + ]) +class TestPipeCifar10(DistributedTest): + world_size = 4 + + def test(self, topo_config): + config_dict = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 20, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": False + }, + "pipeline": { + "seed_layers": True, + "activation_checkpoint_interval": 1 + } + } + + topo = PipeTopo(**topo_config) + steps = 500 # must be >=100 + + # Allocate model for consistent initial weights. 
+ init_net = AlexNetPipe() + + base_net = copy.deepcopy(init_net) + base_model = PipelineModule(layers=base_net.to_layers(), + num_stages=1, + loss_fn=nn.CrossEntropyLoss()) + + # Train with just data parallelism + base_losses = train_cifar(base_model, + config=config_dict, + num_steps=steps, + fp16=config_dict['fp16']['enabled']) + + test_net = copy.deepcopy(init_net) + test_model = PipelineModule(layers=test_net.to_layers(), + topology=topo, + loss_fn=nn.CrossEntropyLoss()) + + test_losses = train_cifar(test_model, + config=config_dict, + num_steps=steps, + fp16=config_dict['fp16']['enabled']) + + abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)] + rel_diffs = [rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)] + if dist.get_rank() == 0: + print( + f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}' + ) + print( + f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}' + ) + print( + f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}' + ) + + for lastX in [1, 10, 100]: + base_avg = sum(base_losses[-lastX:]) / lastX + test_avg = sum(test_losses[-lastX:]) / lastX + print( + f'last-{lastX}: base={base_avg} test={test_avg} abs={base_avg - test_avg} rel={rel_diff(base_avg, test_avg)}' + ) + + lastX = 100 + base = base_losses[-lastX:] + base_avg = sum(base) / len(base) + test = test_losses[-lastX:] + test_avg = sum(test) / len(test) + assert rel_diff( + base_avg, + test_avg) < 0.05 # Originally 0.03, but seeing instability with AMD results diff --git a/tests/unit/runtime/pipe/test_pipe_schedule.py b/tests/unit/runtime/pipe/test_pipe_schedule.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca3dfe1d2a0cf5b82752fd669f59349ed2dc1ad --- /dev/null +++ b/tests/unit/runtime/pipe/test_pipe_schedule.py @@ -0,0 +1,146 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +import deepspeed.runtime.pipe.schedule as 
schedule + + +def _count_type(cmds, classtype): + return len(list(filter(lambda c: type(c) == classtype, cmds))) + + +def test_pipe_inference_schedule_singlestage(): + sched = schedule.InferenceSchedule(micro_batches=4, stages=1, stage_id=0) + assert sched.num_micro_batches == 4 + full = list(iter(sched)) + for idx, cmds in enumerate(full): + assert len(cmds) == 2 + assert type(cmds[0]) == schedule.LoadMicroBatch + assert type(cmds[1]) == schedule.ForwardPass + assert cmds[0].buffer_id == cmds[1].buffer_id + assert len(full) == sched.num_micro_batches + + +def test_pipe_train_schedule_singlestage(): + sched = schedule.TrainSchedule(micro_batches=4, stages=1, stage_id=0) + assert sched.num_micro_batches == 4 + full = list(iter(sched)) + for idx, cmds in enumerate(full): + if (idx % 2) != 0: + assert (len(cmds) == 1) or (len(cmds) == 4) + assert type(cmds[0]) == schedule.BackwardPass + else: + assert len(cmds) == 2 + assert type(cmds[0]) == schedule.LoadMicroBatch + assert type(cmds[1]) == schedule.ForwardPass + assert cmds[0].buffer_id == cmds[1].buffer_id + assert len(full) == sched.num_micro_batches * 2 + + +@pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) +def test_pipe_inference_schedule_firststage(micro_batches, stages=3): + sched = schedule.InferenceSchedule(micro_batches=micro_batches, + stages=stages, + stage_id=0) + assert sched.num_micro_batches == micro_batches + full = list(iter(sched)) + for idx, cmds in enumerate(full): + # Ensure we don't send an activation the first step + if idx == 0: + assert len(cmds) == 2 + assert type(cmds[0]) == schedule.LoadMicroBatch + assert type(cmds[1]) == schedule.ForwardPass + assert cmds[0].buffer_id == cmds[1].buffer_id + continue + + # the last active step is only a send + if idx == sched.num_micro_batches: + assert len(cmds) == 1 + assert type(cmds[0]) == schedule.SendActivation + continue + + # no work later on + if idx > sched.num_micro_batches: + assert len(cmds) == 0 + continue + + # Normally we need to 
load/forward/send + assert len(cmds) == 3 + assert _count_type(cmds, schedule.LoadMicroBatch) == 1 + assert _count_type(cmds, schedule.ForwardPass) == 1 + assert _count_type(cmds, schedule.SendActivation) == 1 + assert len(full) == micro_batches + stages - 1 + + +@pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) +def test_pipe_inference_schedule_midstage(micro_batches, stages=3): + sched = schedule.InferenceSchedule(micro_batches=micro_batches, + stages=stages, + stage_id=1) + + full = list(iter(sched)) + for idx, cmds in enumerate(full): + if idx < sched.stage: + assert len(cmds) == 0 + continue + if idx == sched.stage + sched.num_micro_batches: + assert len(cmds) == 1 + assert type(cmds[0]) == schedule.SendActivation + continue + if idx > sched.stage + sched.num_micro_batches: + assert len(cmds) == 0 + continue + assert _count_type(cmds, schedule.LoadMicroBatch) == 0 + assert _count_type(cmds, schedule.ForwardPass) == 1 + assert _count_type(cmds, schedule.RecvActivation) == 1 + if idx > sched.stage: + assert _count_type(cmds, schedule.SendActivation) == 1 + assert len(full) == micro_batches + stages - 1 + + +@pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) +def test_pipe_inference_schedule_laststage(micro_batches, stages=3): + sched = schedule.InferenceSchedule(micro_batches=micro_batches, + stages=stages, + stage_id=2) + full = list(iter(sched)) + for idx, cmds in enumerate(full): + if idx < sched.stage or idx > sched.stage + sched.num_micro_batches: + assert len(cmds) == 0 + continue + assert _count_type(cmds, schedule.LoadMicroBatch) == 1 + assert _count_type(cmds, schedule.ForwardPass) == 1 + assert _count_type(cmds, schedule.RecvActivation) == 1 + assert _count_type(cmds, schedule.SendActivation) == 0 + assert len(full) == micro_batches + stages - 1 + + +def test_pipe_schedule_firststage(): + sched = schedule.TrainSchedule(micro_batches=8, stages=3, stage_id=0) + for cmds in sched: + assert all(instr.__class__ != schedule.SendGrad for instr in 
cmds) + assert all(instr.__class__ != schedule.RecvActivation for instr in cmds) + for instr in cmds: + if isinstance(instr, schedule.BufferOpInstruction): + assert 0 <= instr.buffer_id < sched.num_pipe_buffers() + + +def test_pipe_schedule_laststage(): + sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=2) + assert len(list(iter(sched))) == 2 * (sched.micro_batches + sched.stages - 1) + for cmds in sched: + assert all(instr.__class__ != schedule.SendActivation for instr in cmds) + assert all(instr.__class__ != schedule.RecvGrad for instr in cmds) + + +def test_pipe_stagequery(): + sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=0) + assert sched.is_first_stage + assert not sched.is_last_stage + + sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=1) + assert not sched.is_first_stage + assert not sched.is_last_stage + + sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=2) + assert not sched.is_first_stage + assert sched.is_last_stage diff --git a/tests/unit/runtime/pipe/test_topology.py b/tests/unit/runtime/pipe/test_topology.py new file mode 100644 index 0000000000000000000000000000000000000000..4b0cc42d4336b29c182bcce4e0d4dbb25b80f3bc --- /dev/null +++ b/tests/unit/runtime/pipe/test_topology.py @@ -0,0 +1,225 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest + +import torch +import deepspeed.comm as dist + +from deepspeed.runtime.pipe.topology import PipelineParallelGrid as Grid +from deepspeed.runtime.pipe.topology import ProcessTopology as Topo +from deepspeed.runtime.pipe.topology import _prime_factors + +from deepspeed.accelerator import get_accelerator +from unit.common import DistributedTest + + +def test_topology_2d(): + topo = Topo(axes=['row', 'col'], dims=[2, 2]) + + assert topo.world_size() == 4 + + assert topo.get_rank(row=0, col=0) == 0 + assert topo.get_rank(row=0, col=1) == 1 + assert topo.get_rank(row=1, col=0) == 2 + assert topo.get_rank(row=1, col=1) == 3 + + 
assert topo.get_axis_list(axis='row', idx=0) == [0, 1] + assert topo.get_axis_list(axis='row', idx=1) == [2, 3] + assert topo.get_axis_list(axis='col', idx=0) == [0, 2] + assert topo.get_axis_list(axis='col', idx=1) == [1, 3] + + +def test_topology_dims(): + topo = Topo(axes=['a', 'b', 'c'], dims=[2, 3, 4]) + assert topo.world_size() == 24 + assert topo.get_dim('a') == 2 + assert topo.get_dim('b') == 3 + assert topo.get_dim('c') == 4 + + +def test_topology_match(): + topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) + print(topo.filter_match(pipe=0, data=1)) + assert topo.filter_match(pipe=0, data=1) == [2, 3] + print([topo.get_coord(r) for r in topo.filter_match(pipe=0, data=1)]) + + +def test_topology_rank_repr(): + topo = Topo(axes=['a', 'b'], dims=[2, 2]) + assert topo.get_rank_repr(rank=0) == 'a_00-b_00' + assert topo.get_rank_repr(rank=1) == 'a_00-b_01' + assert topo.get_rank_repr(rank=2) == 'a_01-b_00' + assert topo.get_rank_repr(rank=3) == 'a_01-b_01' + + assert topo.get_rank_repr(rank=3, inner_sep='+') == 'a+01-b+01' + assert topo.get_rank_repr(rank=3, + inner_sep='🤗', + outer_sep='_JEFF_') == 'a🤗01_JEFF_b🤗01' + + topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) + assert topo.get_rank_repr(rank=0) == '' + assert topo.get_rank_repr(rank=1) == '' + assert topo.get_rank_repr(rank=2) == '' + assert topo.get_rank_repr(rank=3) == '' + + assert topo.get_rank_repr(rank=0, omit_axes=['pipe']) == 'data_00' + assert topo.get_rank_repr(rank=1, omit_axes=['pipe']) == 'data_01' + assert topo.get_rank_repr(rank=2, omit_axes=['pipe']) == 'data_00' + assert topo.get_rank_repr(rank=3, omit_axes=['pipe']) == 'data_01' + + assert topo.get_rank_repr(rank=0, omit_axes=[]) == 'pipe_00-data_00' + assert topo.get_rank_repr(rank=1, omit_axes=[]) == 'pipe_00-data_01' + assert topo.get_rank_repr(rank=2, omit_axes=[]) == 'pipe_01-data_00' + assert topo.get_rank_repr(rank=3, omit_axes=[]) == 'pipe_01-data_01' + + topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) + 
assert topo.get_rank_repr(rank=0) == 'model_00' + assert topo.get_rank_repr(rank=1) == 'model_01' + assert topo.get_rank_repr(rank=2) == 'model_00' + assert topo.get_rank_repr(rank=3) == 'model_01' + assert topo.get_rank_repr(rank=4) == 'model_00' + assert topo.get_rank_repr(rank=5) == 'model_01' + assert topo.get_rank_repr(rank=6) == 'model_00' + assert topo.get_rank_repr(rank=7) == 'model_01' + + +def test_topology_3d(): + topo = Topo(axes=['a', 'b', 'c'], dims=[2, 2, 2]) + + assert topo.get_rank(a=0, b=0, c=0) == 0 + assert topo.get_rank(a=0, b=0, c=1) == 1 + assert topo.get_rank(a=0, b=1, c=0) == 2 + assert topo.get_rank(a=0, b=1, c=1) == 3 + assert topo.get_rank(a=1, b=0, c=0) == 4 + assert topo.get_rank(a=1, b=0, c=1) == 5 + assert topo.get_rank(a=1, b=1, c=0) == 6 + assert topo.get_rank(a=1, b=1, c=1) == 7 + + assert topo.get_axis_list('a', 0) == [0, 1, 2, 3] + assert topo.get_axis_list('a', 1) == [4, 5, 6, 7] + assert topo.get_axis_list('b', 0) == [0, 1, 4, 5] + assert topo.get_axis_list('b', 1) == [2, 3, 6, 7] + assert topo.get_axis_list('c', 0) == [0, 2, 4, 6] + assert topo.get_axis_list('c', 1) == [1, 3, 5, 7] + + assert topo.get_coord(0) == topo.ProcessCoord(0, 0, 0) + assert topo.get_coord(1) == topo.ProcessCoord(0, 0, 1) + assert topo.get_coord(2) == topo.ProcessCoord(0, 1, 0) + assert topo.get_coord(3) == topo.ProcessCoord(0, 1, 1) + assert topo.get_coord(4) == topo.ProcessCoord(1, 0, 0) + assert topo.get_coord(5) == topo.ProcessCoord(1, 0, 1) + assert topo.get_coord(6) == topo.ProcessCoord(1, 1, 0) + assert topo.get_coord(7) == topo.ProcessCoord(1, 1, 1) + + assert topo.filter_match(a=0) == [0, 1, 2, 3] + assert topo.filter_match(b=1, c=1) == [3, 7] + assert topo.filter_match(a=1, b=1, c=1) == [7] + + # Easy access method + assert topo.get_coord(0).a == 0 + + +def test_topology_comm_list(): + topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) + + assert topo.get_rank(pipe=0, data=0, model=0) == 0 + assert topo.get_rank(pipe=0, data=0, 
model=1) == 1 + assert topo.get_rank(pipe=0, data=1, model=0) == 2 + assert topo.get_rank(pipe=0, data=1, model=1) == 3 + assert topo.get_rank(pipe=1, data=0, model=0) == 4 + assert topo.get_rank(pipe=1, data=0, model=1) == 5 + assert topo.get_rank(pipe=1, data=1, model=0) == 6 + assert topo.get_rank(pipe=1, data=1, model=1) == 7 + + pipe_list = [ + [0, 4], # data=0, model=0 + [1, 5], # data=0, model=1 + [2, 6], # data=1, model=0 + [3, 7], # data=1, model=1 + ] + assert topo.get_axis_comm_lists('pipe') == pipe_list + + data_list = [ + [0, 2], # pipe=0, model=0 + [1, 3], # pipe=0, model=1 + [4, 6], # pipe=1, model=0 + [5, 7], # pipe=1, model=1 + ] + assert topo.get_axis_comm_lists('data') == data_list + + model_list = [ + [0, 1], # pipe=0, data=0 + [2, 3], # pipe=0, data=1 + [4, 5], # pipe=1, data=0 + [6, 7], # pipe=1, data=1 + ] + assert topo.get_axis_comm_lists('model') == model_list + + # Handle nonsense. We don't want to RuntimeError because it allows us to write more + # generalized code for data/model/pipe parallelism + assert topo.get_axis_comm_lists('jeff') == [] + + +class TestDistributedTopology(DistributedTest): + world_size = 4 + + def test_grid_pipe_data(self): + topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) + grid = Grid(topology=topo) + + assert grid._is_grid_valid() + + rank = dist.get_rank() + + assert grid.is_first_stage == (grid.get_stage_id() == 0) + assert grid.is_last_stage == ( + grid.get_stage_id() == grid.get_pipe_parallel_world_size() - 1) + + # Test collectives along the pipeline parallel process groups + rank_tensor = torch.LongTensor(data=[rank]).to(get_accelerator().device_name()) + dist.all_reduce(rank_tensor, group=grid.get_pipe_parallel_group()) + pipe_group = grid.pp_group + assert torch.all(rank_tensor == sum(pipe_group)) + + # Test collectives along the data parallel process groups + rank_tensor = torch.LongTensor(data=[rank]).to(get_accelerator().device_name()) + dist.all_reduce(rank_tensor, 
group=grid.get_data_parallel_group()) + data_group = grid.dp_group + assert torch.all(rank_tensor == sum(data_group)) + + def test_stage_to_global(self): + topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) + grid = Grid(topology=topo) + + assert grid._is_grid_valid() + + assert grid.stage_to_global(stage_id=0, data=0) == 0 + assert grid.stage_to_global(stage_id=0, data=1) == 1 + assert grid.stage_to_global(stage_id=1, data=0) == 2 + assert grid.stage_to_global(stage_id=1, data=1) == 3 + + me = topo.get_coord(rank=dist.get_rank()) + if me.data == 0: + assert grid.stage_to_global(stage_id=0) == 0 + assert grid.stage_to_global(stage_id=1) == 2 + else: + assert grid.stage_to_global(stage_id=0) == 1 + assert grid.stage_to_global(stage_id=1) == 3 + + +def test_primes(): + """ Test prime factorizations. """ + def _product(ps): + p = 1 + for num in ps: + p *= num + return p + + with pytest.raises(ValueError): + _prime_factors(0) + + for x in range(1, 30): + primes = _prime_factors(x) + assert _product(primes) == x + for p in primes: + assert _prime_factors(p) == [p] diff --git a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py new file mode 100644 index 0000000000000000000000000000000000000000..638a17bad2ff9ad8684fcbd10d6a3ee8450aaba1 --- /dev/null +++ b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py @@ -0,0 +1,81 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed +from unit.common import DistributedTest + + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.emb = torch.nn.EmbeddingBag(10, 3, mode="sum", sparse=True) + self.linear = torch.nn.Linear(3, 1) + + def forward(self, x, offsets): + return self.linear(self.emb(x, offsets)) + + +class Adam(torch.optim.Optimizer): + def __init__(self, dense_params, sparse_params): + super().__init__(dense_params + sparse_params, defaults={}) + self.adam = 
torch.optim.Adam(dense_params) + self.adam_sparse = torch.optim.SparseAdam(sparse_params) + + @torch.no_grad() + def step(self, closure=None): + loss_1 = self.adam.step(closure) + loss_2 = self.adam_sparse.step(closure) + + if loss_1 is not None and loss_2 is not None: + return loss_1 + loss_2 + return loss_1 or loss_2 + + +def get_model_optimizer(): + torch.manual_seed(0) + model = Model() + optimizer = Adam(list(model.linear.parameters()), list(model.emb.parameters())) + return model, optimizer + + +def get_data(device): + x = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long, device=device) + offsets = torch.tensor([0, 4], dtype=torch.long, device=device) + y = torch.tensor([[1.0], [0.0]], device=device) + return x, offsets, y + + +class TestSparseAdam(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "sparse_gradients": True + } + model, optimizer = get_model_optimizer() + loss = torch.nn.BCEWithLogitsLoss() + engine, _, _, _ = deepspeed.initialize(model=model, + optimizer=optimizer, + config=config_dict) + + x, offsets, y = get_data(engine.device) + + engine.gradient_average = True + res = engine(x, offsets) + engine.backward(loss(res, y)) + + averaged_grads = {} + for k, v in engine.named_parameters(): + grad = v.grad.to_dense() if v.grad.is_sparse else v.grad + averaged_grads[k] = grad + v.grad = None + + engine.gradient_average = False + res = engine(x, offsets) + engine.backward(loss(res, y)) + + for k, v in engine.named_parameters(): + grad = v.grad.to_dense() if v.grad.is_sparse else v.grad + assert torch.allclose(grad, averaged_grads[k] * engine.world_size) diff --git a/tests/unit/runtime/sparse_tensor/test_csr.py b/tests/unit/runtime/sparse_tensor/test_csr.py new file mode 100644 index 0000000000000000000000000000000000000000..1e4f81b986e870c1a605379bf5c43b30c6be2a4b --- /dev/null +++ b/tests/unit/runtime/sparse_tensor/test_csr.py @@ -0,0 +1,52 @@ +'''Copyright The 
Microsoft DeepSpeed Team''' + +import torch +import random +from deepspeed.runtime.sparse_tensor import SparseTensor + + +def test_csr_addition_self(): + row_count = 10 + random.seed(1234) + + x = torch.ones(1, 5) + for i in range(row_count - 1): + if random.random() > 0.75: + x = torch.cat([x, torch.ones(1, 5)]) + else: + x = torch.cat([x, torch.zeros(1, 5)]) + dense_x = x.clone() + cx = SparseTensor(x) + + assert torch.all(dense_x == cx.to_dense()) + + cx.add(cx) + assert torch.all(dense_x + dense_x == cx.to_dense()) + + +def test_csr_addition_different(): + row_count = 10 + random.seed(1234) + + x = torch.ones(1, 5) + for i in range(row_count - 1): + if random.random() > 0.75: + x = torch.cat([x, torch.ones(1, 5)]) + else: + x = torch.cat([x, torch.zeros(1, 5)]) + dense_x = x.clone() + cx = SparseTensor(x) + + y = torch.ones(1, 5) + for i in range(row_count - 1): + if random.random() > 0.75: + y = torch.cat([y, torch.ones(1, 5)]) + else: + y = torch.cat([y, torch.zeros(1, 5)]) + dense_y = y.clone() + cy = SparseTensor(y) + + dense_sum = dense_x + dense_y + cx.add(cy) + + assert torch.all(dense_sum == cx.to_dense()) diff --git a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py new file mode 100644 index 0000000000000000000000000000000000000000..ba9a6b0282235b94692fb47c60419c85c0fb3c5b --- /dev/null +++ b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py @@ -0,0 +1,74 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed +from unit.common import DistributedTest + +import deepspeed.utils.groups as groups + + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.emb = torch.nn.EmbeddingBag(10, 3, mode="sum", sparse=True) + self.linear = torch.nn.Linear(3, 1) + + def forward(self, x, offsets): + return self.linear(self.emb(x, offsets)) + + +class Adam(torch.optim.Optimizer): + def __init__(self, dense_params, sparse_params): + 
super().__init__(dense_params + sparse_params, defaults={}) + self.adam = torch.optim.Adam(dense_params) + self.adam_sparse = torch.optim.SparseAdam(sparse_params) + + @torch.no_grad() + def step(self, closure=None): + loss_1 = self.adam.step(closure) + loss_2 = self.adam_sparse.step(closure) + + if loss_1 is not None and loss_2 is not None: + return loss_1 + loss_2 + return loss_1 or loss_2 + + +class TestSparseAdam(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "sparse_gradients": True + } + + model = Model() + optimizer = Adam(list(model.linear.parameters()), list(model.emb.parameters())) + engine, _, _, _ = deepspeed.initialize(model=model, + optimizer=optimizer, + config=config_dict) + loss = torch.nn.BCEWithLogitsLoss() + x = torch.tensor([1, + 2, + 4, + 5, + 4, + 3, + 2, + 9], + dtype=torch.long, + device=engine.device) + offsets = torch.tensor([0, 4], dtype=torch.long, device=engine.device) + y = torch.tensor([[1.0], [0.0]], device=engine.device) + res = engine(x, offsets) + engine.backward(loss(res, y)) + engine.step() + + results = [ + engine.all_gather_scalar(i, + groups._get_data_parallel_group()) + for i in model.emb.parameters() + ] + for res in results: + assert torch.allclose(res[0], res[1]) diff --git a/tests/unit/runtime/test_autocast.py b/tests/unit/runtime/test_autocast.py new file mode 100644 index 0000000000000000000000000000000000000000..b0d8d8696cb8e5611e4f19d9fed98ed6c8483511 --- /dev/null +++ b/tests/unit/runtime/test_autocast.py @@ -0,0 +1,76 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +import torch +from deepspeed.runtime.zero.linear import LinearModuleForZeroStage3 +from deepspeed.accelerator import get_accelerator +from unit.common import DistributedTest + + +@pytest.mark.parametrize('half_op', [False, True]) +class TestAutoCastDisable(DistributedTest): + def test_missing_amp_autocast(self, half_op): + hidden_dim = 4 + if half_op: + input 
= torch.randn(hidden_dim).to(get_accelerator().device_name()).half() + ds_linear = LinearModuleForZeroStage3( + hidden_dim, + hidden_dim).to(get_accelerator().device_name()).half() + else: + input = torch.randn(hidden_dim).to(get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, + hidden_dim).to( + get_accelerator().device_name()) + + output = ds_linear(input) + assert output.dtype == ds_linear.weight.dtype + + def test_disable_autocast_linear(self, half_op): + amp = get_accelerator().amp() + + hidden_dim = 4 + if half_op: + input = torch.randn(hidden_dim).to(get_accelerator().device_name()).half() + ds_linear = LinearModuleForZeroStage3( + hidden_dim, + hidden_dim).to(get_accelerator().device_name()).half() + else: + input = torch.randn(hidden_dim).to(get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, + hidden_dim).to( + get_accelerator().device_name()) + + with amp.autocast(False): + output = ds_linear(input) + assert output.dtype == ds_linear.weight.dtype + + +@pytest.mark.skipif(get_accelerator().amp() is None, reason='amp is not installed') +@pytest.mark.parametrize('half_input, half_weight', + [(False, + False), + (False, + True), + (True, + False), + (True, + True)]) +class TestAutoCastEnable(DistributedTest): + def test_autocast_linear(self, tmpdir, half_input, half_weight): + amp = get_accelerator().amp() + + hidden_dim = 4 + input = torch.randn(hidden_dim).to(get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, + hidden_dim).to( + get_accelerator().device_name()) + + if half_input: + input = input.half() + + if half_weight: + ds_linear = ds_linear.half() + + with amp.autocast(): + output = ds_linear(input) + assert output.dtype == torch.half or output.dtype == torch.bfloat16 diff --git a/tests/unit/runtime/test_data.py b/tests/unit/runtime/test_data.py new file mode 100644 index 0000000000000000000000000000000000000000..ed2fee950bc3b193c3dc7f1470c38771cae02a08 
--- /dev/null +++ b/tests/unit/runtime/test_data.py @@ -0,0 +1,59 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from deepspeed.utils import RepeatingLoader +import torch +import pytest +import deepspeed +from deepspeed.accelerator import get_accelerator +from unit.common import DistributedTest +from unit.simple_model import SimpleModel, random_dataset + + +def test_repeating_loader(): + loader = [1, 2, 3] + loader = RepeatingLoader(loader) + + for idx in range(50): + assert next(loader) == 1 + assert next(loader) == 2 + assert next(loader) == 3 + + +@pytest.mark.parametrize('train_batch_size, drop_last', + [(1, + True), + (4, + True), + (1, + False), + (4, + False)]) +class TestDataLoaderDropLast(DistributedTest): + world_size = 1 + + def test(self, train_batch_size, drop_last): + config_dict = { + "train_batch_size": train_batch_size, + "dataloader_drop_last": drop_last, + "steps_per_print": 1 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + optimizer = torch.optim.AdamW(params=model.parameters()) + # TODO: no way to set DeepSpeedEngine.deepspeed_io params, need to use + # pin_memory=False for cuda device + train_dataset = random_dataset(total_samples=50, + hidden_dim=hidden_dim, + device=torch.device('cpu'), + dtype=torch.float32) + model, _, training_dataloader, _ = deepspeed.initialize(config=config_dict, + model=model, + training_data=train_dataset, + optimizer=optimizer) + for n, batch in enumerate(training_dataloader): + x = batch[0].to(get_accelerator().current_device_name()) + y = batch[1].to(get_accelerator().current_device_name()) + loss = model(x, y) + model.backward(loss) + model.step() diff --git a/tests/unit/runtime/test_data_efficiency.py b/tests/unit/runtime/test_data_efficiency.py new file mode 100644 index 0000000000000000000000000000000000000000..993e4aa66e20d6806d30d90ad9d0487f10091289 --- /dev/null +++ b/tests/unit/runtime/test_data_efficiency.py @@ -0,0 +1,228 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch 
+import os +import deepspeed +from deepspeed.accelerator import get_accelerator +from unit.common import DistributedTest +from unit.simple_model import Curriculum_SimpleModel, SimpleModel, random_dataloader, random_dataset + + +class MPU(): + def __init__(self, tp_world_size): + self.rank = deepspeed.comm.get_rank() + self.world_size = deepspeed.comm.get_world_size() + self.tp_world_size = tp_world_size + + for i in range(0, self.world_size, tp_world_size): + ranks = range(i, i + tp_world_size) + group = deepspeed.comm.new_group(ranks) + if self.rank in ranks: + self.tp_group = group + + for i in range(0, tp_world_size): + ranks = range(i, self.world_size, tp_world_size) + group = deepspeed.comm.new_group(ranks) + if self.rank in ranks: + self.dp_group = group + + def get_model_parallel_rank(self): + return self.rank % self.tp_world_size + + def get_model_parallel_world_size(self): + return self.tp_world_size + + def get_data_parallel_rank(self): + return self.rank // self.tp_world_size + + def get_data_parallel_world_size(self): + return self.world_size // self.tp_world_size + + def get_data_parallel_group(self): + return self.dp_group + + def get_model_parallel_group(self): + return self.tp_group + + +class TestDataEfficiency(DistributedTest): + world_size = 2 + + def test_curriculum_learning(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "data_efficiency": { + "enabled": True, + "seed": 1234, + "data_sampling": { + "enabled": True, + "num_workers": 0, + "curriculum_learning": { + "enabled": True, + "data_cluster_path": "/tmp", + "curriculum_metrics": { + "dummy_metric": { + "index_to_sample_path": "dummy", + "index_to_metric_path": "dummy", + "difficulty_type": "value", + "clustering_type": "single_cluster", + "min_difficulty": 
2, + "max_difficulty": 10, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": 8, + "difficulty_step": 2, + "root_degree": 1 + } + } + } + } + } + } + } + + def data_post_process(data, data_sampler_state_dict): + assert 'dummy_metric' in data_sampler_state_dict['current_difficulties'] + return data + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + dataset = random_dataset(20, hidden_dim, torch.device('cpu'), dtype=torch.half) + model, _, data_loader, _ = deepspeed.initialize(config=config_dict, + model=model, + training_data=dataset, + model_parameters=model.parameters(), + mpu=MPU(1)) + if model.mpu.get_data_parallel_rank() == 0 and not os.path.exists('/tmp'): + os.makedirs('/tmp') + model.set_data_post_process_func(data_post_process) + for n, batch in enumerate(data_loader): + x = batch[0].to(get_accelerator().current_device_name()) + y = batch[1].to(get_accelerator().current_device_name()) + loss = model(x, y) + model.backward(loss) + model.step() + if n >= 10: + break + + +class TestLegacyCurriculumScheduler(DistributedTest): + world_size = 2 + + def test_fixed_discrete(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "curriculum_learning": { + "enabled": True, + "curriculum_type": "seqlen", + "min_difficulty": 1, + "max_difficulty": 5, + "schedule_type": "fixed_discrete", + "schedule_config": { + "difficulty": [1, + 2, + 3, + 4, + 5], + "max_step": [2, + 4, + 6, + 8] + } + } + } + hidden_dim = 10 + ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4} + + model = Curriculum_SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=20, + 
hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss, seqlen = model(batch[0], batch[1]) + model.backward(loss) + model.step() + true_seqlen = 5 + if n + 1 in ground_truths: + true_seqlen = ground_truths[n + 1] + assert seqlen == true_seqlen, f"Incorrect curriculum schedule" + + def test_fixed_linear(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "curriculum_learning": { + "enabled": True, + "curriculum_type": "seqlen", + "min_difficulty": 2, + "max_difficulty": 10, + "schedule_type": "fixed_linear", + "schedule_config": { + "total_curriculum_step": 8, + "difficulty_step": 2 + } + } + } + hidden_dim = 10 + ground_truths = {1: 2, 2: 4, 3: 4, 4: 6, 5: 6, 6: 8, 7: 8, 8: 10, 9: 10, 10: 10} + + model = Curriculum_SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=20, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss, seqlen = model(batch[0], batch[1]) + model.backward(loss) + model.step() + if n + 1 in ground_truths: + true_seqlen = ground_truths[n + 1] + assert seqlen == true_seqlen, f"Incorrect curriculum schedule" diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..54c91a6fc3e6884e16d63d46ab4cd77b0073e11e --- /dev/null +++ b/tests/unit/runtime/test_ds_config_dict.py @@ -0,0 +1,298 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +# A test on its own +import os +import pytest +import json +import hjson +import argparse + +from deepspeed.runtime.zero.config import DeepSpeedZeroConfig 
+from deepspeed.accelerator import get_accelerator + +from unit.common import DistributedTest, get_test_path +from unit.simple_model import SimpleModel, create_config_from_dict, random_dataloader +import deepspeed.comm as dist + +# A test on its own +import deepspeed +from deepspeed.runtime.config import DeepSpeedConfig, get_bfloat16_enabled + + +class TestBasicConfig(DistributedTest): + world_size = 1 + + def test_accelerator(self): + assert (get_accelerator().is_available()) + + def test_check_version(self): + assert hasattr(deepspeed, "__git_hash__") + assert hasattr(deepspeed, "__git_branch__") + assert hasattr(deepspeed, "__version__") + assert hasattr(deepspeed, "__version_major__") + assert hasattr(deepspeed, "__version_minor__") + assert hasattr(deepspeed, "__version_patch__") + + +@pytest.fixture +def base_config(): + config_dict = { + "train_batch_size": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + return config_dict + + +def _run_batch_config(ds_config, train_batch=None, micro_batch=None, gas=None): + ds_config.train_batch_size = train_batch + ds_config.train_micro_batch_size_per_gpu = micro_batch + ds_config.gradient_accumulation_steps = gas + success = True + try: + ds_config._configure_train_batch_size() + except AssertionError: + success = False + return success + + +def _batch_assert(status, ds_config, batch, micro_batch, gas, success): + + if not success: + assert not status + print("Failed but All is well") + return + + assert ds_config.train_batch_size == batch + assert ds_config.train_micro_batch_size_per_gpu == micro_batch + assert ds_config.gradient_accumulation_steps == gas + print("All is well") + + +#Tests different batch config provided in deepspeed json file +@pytest.mark.parametrize('num_ranks,batch,micro_batch,gas,success', + [(2,32,16,1,True), + (2,32,8,2,True), + (2,33,17,2,False), + (2,32,18,1,False)]) # yapf: disable +class TestBatchConfig(DistributedTest): + 
world_size = 2 + + def test(self, num_ranks, batch, micro_batch, gas, success): + assert dist.get_world_size() == num_ranks, \ + 'The test assumes a world size of f{num_ranks}' + + ds_batch_config = get_test_path('ds_batch_config.json') + ds_config = DeepSpeedConfig(ds_batch_config) + + #test cases when all parameters are provided + status = _run_batch_config(ds_config, + train_batch=batch, + micro_batch=micro_batch, + gas=gas) + _batch_assert(status, ds_config, batch, micro_batch, gas, success) + + #test cases when two out of three parameters are provided + status = _run_batch_config(ds_config, train_batch=batch, micro_batch=micro_batch) + _batch_assert(status, ds_config, batch, micro_batch, gas, success) + + if success: + #when gas is provided with one more parameter + status = _run_batch_config(ds_config, train_batch=batch, gas=gas) + _batch_assert(status, ds_config, batch, micro_batch, gas, success) + + status = _run_batch_config(ds_config, micro_batch=micro_batch, gas=gas) + _batch_assert(status, ds_config, batch, micro_batch, gas, success) + + #test the case when only micro_batch or train_batch is provided + if gas == 1: + status = _run_batch_config(ds_config, micro_batch=micro_batch) + _batch_assert(status, ds_config, batch, micro_batch, gas, success) + + status = _run_batch_config(ds_config, train_batch=batch) + _batch_assert(status, ds_config, batch, micro_batch, gas, success) + else: + #when only gas is provided + status = _run_batch_config(ds_config, gas=gas) + _batch_assert(status, ds_config, batch, micro_batch, gas, success) + + #when gas is provided with something else and gas does not divide batch + if gas != 1: + status = _run_batch_config(ds_config, train_batch=batch, gas=gas) + _batch_assert(status, ds_config, batch, micro_batch, gas, success) + + +def test_temp_config_json(tmpdir): + config_dict = { + "train_batch_size": 1, + } + config_path = create_config_from_dict(tmpdir, config_dict) + config_json = json.load(open(config_path, 'r')) + assert 
'train_batch_size' in config_json + + +@pytest.mark.parametrize("gather_weights_key", + [ + "stage3_gather_16bit_weights_on_model_save", + "stage3_gather_fp16_weights_on_model_save" + ]) +def test_gather_16bit_params_on_model_save(gather_weights_key): + config_dict = { + gather_weights_key: True, + } + config = DeepSpeedZeroConfig(**config_dict) + + assert config.gather_16bit_weights_on_model_save == True + + +@pytest.mark.parametrize("bf16_key", ["bf16", "bfloat16"]) +def test_get_bfloat16_enabled(bf16_key): + cfg = { + bf16_key: { + "enabled": True, + }, + } + assert get_bfloat16_enabled(cfg) == True + + +class TestConfigLoad(DistributedTest): + world_size = 1 + + def test_dict(self, base_config): + hidden_dim = 10 + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=base_config, + model=model, + model_parameters=model.parameters()) + + def test_json(self, base_config, tmpdir): + config_path = os.path.join(tmpdir, "config.json") + with open(config_path, 'w') as fp: + json.dump(base_config, fp) + hidden_dim = 10 + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_path, + model=model, + model_parameters=model.parameters()) + + def test_hjson(self, base_config, tmpdir): + config_path = os.path.join(tmpdir, "config.json") + with open(config_path, 'w') as fp: + hjson.dump(base_config, fp) + hidden_dim = 10 + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_path, + model=model, + model_parameters=model.parameters()) + + +class TestDeprecatedDeepScaleConfig(DistributedTest): + world_size = 1 + + def test(self, base_config, tmpdir): + config_path = create_config_from_dict(tmpdir, base_config) + parser = argparse.ArgumentParser() + args = parser.parse_args(args='') + args.deepscale_config = config_path + args.local_rank = 0 + + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _,_ = deepspeed.initialize(args=args, + model=model, + 
model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestDistInit(DistributedTest): + world_size = 1 + + def test(self, base_config): + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _,_ = deepspeed.initialize(config=base_config, + model=model, + model_parameters=model.parameters(), + dist_init_required=True) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestInitNoOptimizer(DistributedTest): + world_size = 1 + + def test(self, base_config): + del base_config["optimizer"] + hidden_dim = 10 + + model = SimpleModel(hidden_dim=hidden_dim) + + model, _, _, _ = deepspeed.initialize(config=base_config, model=model) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + with pytest.raises(AssertionError): + model.backward(loss) + with pytest.raises(AssertionError): + model.step() + + +class TestArgs(DistributedTest): + world_size = 1 + + def test_none_args(self, base_config): + model = SimpleModel(hidden_dim=10) + model, _, _, _ = deepspeed.initialize(args=None, model=model, config=base_config) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=10, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + + def test_no_args(self, base_config): + model = SimpleModel(hidden_dim=10) + model, _, _, _ = deepspeed.initialize(model=model, config=base_config) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=10, + 
device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + + +class TestNoModel(DistributedTest): + world_size = 1 + + def test(self, base_config): + model = SimpleModel(hidden_dim=10) + with pytest.raises(AssertionError): + model, _, _, _ = deepspeed.initialize(model=None, config=base_config) + + with pytest.raises(AssertionError): + model, _, _, _ = deepspeed.initialize(model, config=base_config) diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py new file mode 100644 index 0000000000000000000000000000000000000000..24343a999f695ea3ad7ef22fde50dc0fdb219269 --- /dev/null +++ b/tests/unit/runtime/test_ds_config_model.py @@ -0,0 +1,88 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +import os +import json +from pydantic import Field, ValidationError +from typing import List +from deepspeed.runtime import config as ds_config +from deepspeed.runtime.config_utils import DeepSpeedConfigModel + + +class SimpleConf(DeepSpeedConfigModel): + param_1: int = 0 + param_2_old: str = Field(None, + deprecated=True, + new_param="param_2", + new_param_fn=(lambda x: [x])) + param_2: List[str] = None + param_3: int = Field(0, alias="param_3_alias") + + +def test_only_required_fields(tmpdir): + '''Ensure that config containing only the required fields is accepted. 
''' + cfg_json = tmpdir.mkdir('ds_config_unit_test').join('minimal.json') + + with open(cfg_json, 'w') as f: + required_fields = {'train_batch_size': 64} + json.dump(required_fields, f) + + run_cfg = ds_config.DeepSpeedConfig(cfg_json) + assert run_cfg is not None + assert run_cfg.train_batch_size == 64 + assert run_cfg.train_micro_batch_size_per_gpu == 64 + assert run_cfg.gradient_accumulation_steps == 1 + + +def test_config_duplicate_key(tmpdir): + config_dict = ''' + { + "train_batch_size": 24, + "train_batch_size": 24, + } + ''' + config_path = os.path.join(tmpdir, 'temp_config.json') + + with open(config_path, 'w') as jf: + jf.write("%s" % config_dict) + + with pytest.raises(ValueError): + run_cfg = ds_config.DeepSpeedConfig(config_path) + + +def test_config_base(): + config = SimpleConf(**{"param_1": 42}) + assert config.param_1 == 42 + + +def test_config_base_deprecatedfield(): + config = SimpleConf(**{"param_2_old": "DS"}) + assert config.param_2 == ["DS"] + + +def test_config_base_aliasfield(): + config = SimpleConf(**{"param_3": 10}) + assert config.param_3 == 10 + + config = SimpleConf(**{"param_3_alias": 10}) + assert config.param_3 == 10 + + +@pytest.mark.parametrize("config_dict", + [{ + "param_1": "DS" + }, + { + "param_2": "DS" + }, + { + "param_1_typo": 0 + }]) +def test_config_base_literalfail(config_dict): + with pytest.raises(ValidationError): + config = SimpleConf(**config_dict) + + +def test_config_base_deprecatedfail(): + with pytest.raises(AssertionError): + config = SimpleConf(**{"param_2": ["DS"], "param_2_old": "DS"}) diff --git a/tests/unit/runtime/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py new file mode 100644 index 0000000000000000000000000000000000000000..c7eeef863bdaf12bab84a66597c17263ba6a512f --- /dev/null +++ b/tests/unit/runtime/test_ds_initialize.py @@ -0,0 +1,280 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +from typing import Callable +import torch +from torch.optim import Optimizer, 
Adam, AdamW +from torch.optim.lr_scheduler import _LRScheduler, LambdaLR + +from unit.simple_model import SimpleModel, random_dataloader +from unit.common import DistributedTest +from unit.util import required_torch_version, bf16_required_version_check, required_amp_check + +import deepspeed +from deepspeed.ops.adam import FusedAdam +from deepspeed.runtime.lr_schedules import WARMUP_LR, WarmupLR +from deepspeed.runtime.config import ADAM_OPTIMIZER +from deepspeed.runtime.utils import see_memory_usage + + +@pytest.mark.parametrize('zero_stage', [0, 3]) +class TestNoOptim(DistributedTest): + world_size = 1 + + def test(self, zero_stage): + if zero_stage == 3 and not required_torch_version(): + pytest.skip("zero-3 param offload requires at least torch 1.8") + + ds_config = { + 'train_batch_size': self.world_size, + 'fp16': { + 'enabled': True + }, + 'zero_optimization': { + "stage": zero_stage, + "offload_param": { + "device": "cpu" + } + } + } + # 20B test + #hidden_dim = 16 * 1024 + hidden_dim = 4 + + with deepspeed.zero.Init(enabled=zero_stage == 3, config_dict_or_path=ds_config): + model = SimpleModel(hidden_dim, nlayers=78) + see_memory_usage('pre-init', force=True) + model, _, _, _ = deepspeed.initialize(model=model, config=ds_config) + see_memory_usage('post-init', force=True) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.half) + for batch in data_loader: + model(batch[0], batch[1]) + see_memory_usage('post-fwds', force=True) + + +@pytest.mark.parametrize('optimizer_type', [None, Optimizer, Callable]) +class TestClientOptimizer(DistributedTest): + world_size = 1 + + def test(self, optimizer_type): + def _optimizer_callable(params) -> Optimizer: + return AdamW(params=params) + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + config_dict = {'train_batch_size': 1} + if optimizer_type is None: + client_optimizer = None + config_dict['optimizer'] = {'type': ADAM_OPTIMIZER} + 
elif optimizer_type is Optimizer: + client_optimizer = Adam(model.parameters()) + else: + client_optimizer = _optimizer_callable + + _, ds_optimizer, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=list(model.parameters()), + optimizer=client_optimizer) + if client_optimizer is None: + assert isinstance(ds_optimizer, FusedAdam) + elif isinstance(client_optimizer, Optimizer): + assert ds_optimizer == client_optimizer + else: + assert isinstance(ds_optimizer, AdamW) + + +@pytest.mark.parametrize('client_parameters', [True, False]) +class TestConfigOptimizer(DistributedTest): + world_size = 1 + + def test(self, client_parameters): + ds_config = { + "train_batch_size": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001 + } + } + } + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + if client_parameters: + model_parameters = list(model.parameters()) + else: + model_parameters = None + + _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, + model=model, + model_parameters=model_parameters) + + assert isinstance(ds_optimizer, FusedAdam) + + +@pytest.mark.parametrize('optimizer_extension', ['zero1', 'zero2', 'amp', None]) +@pytest.mark.parametrize('model_dtype', ['fp16', 'bf16', 'fp32']) +@pytest.mark.parametrize('grad_accum_dtype', [None, 'fp16', 'bf16', 'fp32']) +class TestOptimizerImplementation(DistributedTest): + world_size = 1 + + def test(self, optimizer_extension, model_dtype, grad_accum_dtype): + if optimizer_extension == 'zero1': + zero_stage = 1 + elif optimizer_extension == 'zero2': + zero_stage = 2 + else: + zero_stage = 0 + amp = True if optimizer_extension == 'amp' else False + fp16 = True if model_dtype == 'fp16' else False + bf16 = True if model_dtype == 'bf16' else False + # Skip checks + if bf16 and not bf16_required_version_check(): + pytest.skip( + "DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + if amp and not 
required_amp_check(): + pytest.skip("Amp is not installed can't run amp check") + # Config declaration + ds_config = { + "train_batch_size": 1, + 'fp16': { + 'enabled': fp16 + }, + 'bf16': { + 'enabled': bf16 + }, + 'amp': { + 'enabled': amp + }, + 'zero_optimization': { + "stage": zero_stage + }, + "data_types": { + "grad_accum_dtype": grad_accum_dtype + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001 + } + } + } + + key = (optimizer_extension, model_dtype, grad_accum_dtype) + + # Enumerate supported configurations + is_supported = {} + # ZeRO 1 Wrapper + is_supported[('zero1', 'fp16', None)] = True + is_supported[('zero1', 'fp16', 'fp16')] = True + is_supported[('zero1', 'bf16', None)] = True + is_supported[('zero1', 'bf16', 'bf16')] = True + is_supported[('zero1', 'bf16', 'fp32')] = True + is_supported[('zero1', 'fp32', None)] = True + is_supported[('zero1', 'fp32', 'fp32')] = True + # ZeRO 2 Wrapper + is_supported[('zero2', 'fp16', None)] = True + is_supported[('zero2', 'fp16', 'fp16')] = True + is_supported[('zero2', 'bf16', None)] = True + is_supported[('zero2', 'bf16', 'bf16')] = True + is_supported[('zero2', 'fp32', None)] = True + is_supported[('zero2', 'fp32', 'fp32')] = True + # Amp Wrapper + is_supported[('amp', 'fp32', None)] = True + is_supported[('amp', 'fp32', 'fp32')] = True + # FP16 Wrapper + is_supported[(None, 'fp16', None)] = True + is_supported[(None, 'fp16', 'fp16')] = True + # BF16 Wrapper + is_supported[(None, 'bf16', 'fp32')] = True + is_supported[(None, 'bf16', None)] = True + # No Wrapper + is_supported[(None, 'fp32', None)] = True + is_supported[(None, 'fp32', 'fp32')] = True + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + model_parameters = list(model.parameters()) + + if key in is_supported: + _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, + model=model, + model_parameters=model_parameters) + assert True + else: + with pytest.raises(NotImplementedError): + _, ds_optimizer, _, _ = 
deepspeed.initialize(config=ds_config, + model=model, + model_parameters=model_parameters) + + +@pytest.mark.parametrize("scheduler_type", [None, _LRScheduler, Callable]) +@pytest.mark.parametrize("optimizer_type", [None, Optimizer, Callable]) +class TestClientLrScheduler(DistributedTest): + world_size = 1 + + def test(self, scheduler_type, optimizer_type): + def _my_lambda(epoch): + return epoch // 10 + + def _optimizer_callable(params) -> Optimizer: + return torch.optim.AdamW(params=params) + + def _lr_scheduler_callable(optimizer) -> _LRScheduler: + return LambdaLR(optimizer, _my_lambda) + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + config_dict = {'train_batch_size': 1} + + client_optimizer = None + client_scheduler = None + + if optimizer_type is None: + config_dict['optimizer'] = {'type': ADAM_OPTIMIZER} + elif optimizer_type is Optimizer: + client_optimizer = torch.optim.Adam(model.parameters()) + else: + client_optimizer = _optimizer_callable + + if scheduler_type is None: + config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}} + elif scheduler_type == _LRScheduler: + if isinstance(client_optimizer, Optimizer): + client_scheduler = LambdaLR(client_optimizer, _my_lambda) + else: + # Verify invalid combination is correctly handled + client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()), + _my_lambda) + else: + client_scheduler = _lr_scheduler_callable + + if isinstance(client_scheduler, + _LRScheduler) and not isinstance(client_optimizer, + Optimizer): + with pytest.raises(AssertionError): + _, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=list(model.parameters()), + optimizer=client_optimizer, + lr_scheduler=client_scheduler) + else: + _, _, _, ds_lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=list(model.parameters()), + optimizer=client_optimizer, + lr_scheduler=client_scheduler) + if client_scheduler is None: + assert isinstance(ds_lr_scheduler, 
WarmupLR) + elif isinstance(client_scheduler, _LRScheduler): + assert ds_lr_scheduler == client_scheduler + else: + assert isinstance(ds_lr_scheduler, LambdaLR) diff --git a/tests/unit/runtime/test_lr_schedulers.py b/tests/unit/runtime/test_lr_schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..7afcad5426c4e5c7132ab9a97a3459681eac9289 --- /dev/null +++ b/tests/unit/runtime/test_lr_schedulers.py @@ -0,0 +1,455 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed +import pytest +from unit.common import DistributedTest +from unit.simple_model import SimpleModel, random_dataloader +from deepspeed.runtime.lr_schedules import LR_RANGE_TEST, LR_RANGE_TEST_MIN_LR, LR_RANGE_TEST_STEP_RATE, LR_RANGE_TEST_STEP_SIZE, LR_RANGE_TEST_STAIRCASE +from deepspeed.runtime.lr_schedules import WARMUP_LR, WARMUP_MIN_LR, WARMUP_MAX_LR, WARMUP_NUM_STEPS, WARMUP_TYPE, WARMUP_LOG_RATE, WARMUP_LINEAR_RATE +from deepspeed.runtime.lr_schedules import ONE_CYCLE, CYCLE_MIN_LR, CYCLE_MAX_LR, CYCLE_FIRST_STEP_SIZE, DECAY_LR_RATE, DECAY_STEP_SIZE +from deepspeed.runtime.lr_schedules import CYCLE_MIN_MOM, CYCLE_MAX_MOM, DECAY_MOM_RATE +from deepspeed.runtime.lr_schedules import WARMUP_DECAY_LR, TOTAL_NUM_STEPS + + +def _verify_continuous_decrease(values): + for i in range(len(values) - 1): + assert values[i] > values[i + 1] + + +def _verify_continuous_increase(values): + for i in range(len(values) - 1): + assert values[i] < values[i + 1] + + +def _verify_staircase_increase(values, step_size): + num_values = len(values) + for i in range(0, num_values, step_size): + j = min(i + step_size, num_values) + assert all([values[i] == v for v in values[i:j]]) + + +@pytest.mark.parametrize("scheduler_type,params", + [(WARMUP_LR, + {}), + (WARMUP_DECAY_LR, + { + WARMUP_NUM_STEPS: 10, + TOTAL_NUM_STEPS: 20 + }), + (ONE_CYCLE, + { + CYCLE_MIN_LR: 0, + CYCLE_MAX_LR: 0.1 + }), + (LR_RANGE_TEST, + {})]) +class TestGetLrBeforeTrain(DistributedTest): + 
world_size = 1 + + def test(self, scheduler_type, params): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": scheduler_type, + "params": params + }, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + # get lr before training starts + lr_scheduler.get_lr() + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize("warmup_num_steps", [10, 15, 19, 33]) +@pytest.mark.parametrize("warmup_type", [WARMUP_LOG_RATE, WARMUP_LINEAR_RATE]) +class TestLrSchedule(DistributedTest): + world_size = 1 + + def test_lr_warmup_schedule(self, warmup_num_steps, warmup_type): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": WARMUP_LR, + "params": { + WARMUP_MIN_LR: 0.1, + WARMUP_MAX_LR: 0.2, + WARMUP_NUM_STEPS: warmup_num_steps, + WARMUP_TYPE: warmup_type, + } + }, + "gradient_clipping": 1.0 + } + schedule_params = config_dict["scheduler"]["params"] + total_num_steps = 2 * warmup_num_steps + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=total_num_steps * 2, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + step_lrs = [] + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + 
model.step() + step_lrs.append(lr_scheduler.get_lr()) + + # Verify initial lr + assert step_lrs[0] == [schedule_params[WARMUP_MIN_LR]] + + # Verify warmup completion + warmup_num_steps = schedule_params[WARMUP_NUM_STEPS] + warmup_max_lr = [schedule_params[WARMUP_MAX_LR]] + assert step_lrs[warmup_num_steps] == warmup_max_lr + + # Verify post-warmup completion + assert all([warmup_max_lr == lr for lr in step_lrs[warmup_num_steps:]]) + + def test_lr_warmup_decay_schedule(self, warmup_num_steps, warmup_type): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": WARMUP_DECAY_LR, + "params": { + WARMUP_MIN_LR: 0.1, + WARMUP_MAX_LR: 0.2, + WARMUP_NUM_STEPS: warmup_num_steps, + TOTAL_NUM_STEPS: warmup_num_steps * 2, + WARMUP_TYPE: warmup_type + } + }, + "gradient_clipping": 1.0 + } + schedule_params = config_dict["scheduler"]["params"] + total_num_steps = schedule_params[TOTAL_NUM_STEPS] + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=total_num_steps * 2, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + step_lrs = [] + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + step_lrs.append(lr_scheduler.get_lr()) + + # Verify initial lr + assert step_lrs[0] == [schedule_params[WARMUP_MIN_LR]] + + # Verify lr at warmup completion + warmup_num_steps = schedule_params[WARMUP_NUM_STEPS] + warmup_max_lr = [schedule_params[WARMUP_MAX_LR]] + assert step_lrs[warmup_num_steps] == warmup_max_lr + + # Verify decay phase + previous_lr = warmup_max_lr + for lr in step_lrs[warmup_num_steps + 1:]: + assert lr < previous_lr + previous_lr = lr + + 
+@pytest.mark.parametrize("scheduler_type,params", + [(WARMUP_LR, + {}), + (WARMUP_DECAY_LR, + { + WARMUP_NUM_STEPS: 5, + TOTAL_NUM_STEPS: 10 + }), + (ONE_CYCLE, + { + CYCLE_MIN_LR: 0, + CYCLE_MAX_LR: 0.1, + CYCLE_FIRST_STEP_SIZE: 5, + DECAY_STEP_SIZE: 5 + }), + (LR_RANGE_TEST, + { + LR_RANGE_TEST_MIN_LR: 1e-4, + LR_RANGE_TEST_STEP_SIZE: 1 + })]) +class TestSchedulerOptimizerParity(DistributedTest): + world_size = 1 + + def test(self, scheduler_type, params): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": scheduler_type, + "params": params + }, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + assert lr_scheduler.get_lr() == model.get_lr() + + +@pytest.mark.parametrize("min_lr, step_rate, step_size, staircase", + [(1e-4, 1e-5, 1, True), + (1e-5, 1e-5, 1, False), + (1e-4, 1e-3, 10, True), + (1e-3, 1e-3, 10, False), + (1e-2, 1e-2, 19, True), + (1e-2, 1e-2, 19, False) + ])# yapf: disable +class TestLrRange(DistributedTest): + world_size = 1 + + def test(self, min_lr, step_rate, step_size, staircase): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": LR_RANGE_TEST, + "params": { + LR_RANGE_TEST_MIN_LR: min_lr, + LR_RANGE_TEST_STEP_RATE: step_rate, + LR_RANGE_TEST_STEP_SIZE: step_size, + LR_RANGE_TEST_STAIRCASE: staircase + } + }, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = 
SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=max(50, + step_size * 2), + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + + step_lrs = [] + for _, batch in enumerate(data_loader): + step_lrs.extend(lr_scheduler.get_lr()) + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + # Verify starting lr + assert step_lrs[0] == min_lr + + if staircase: + # Verify staircase increasing lr + _verify_staircase_increase(step_lrs, step_size) + else: + # Verify continuous increasing lr + _verify_continuous_increase(step_lrs) + + +class TestOneCycle(DistributedTest): + world_size = 1 + + @pytest.mark.parametrize("min_lr, max_lr, decay_rate, cycle_step_size, decay_step_size", + [ + (1e-5, 1e-2, 1e-3, 10, 10), + (1e-3, 1e-1, 0, 21, 21), + (1e-5, 1e-2, 1e-3, 10, 10), + (1e-3, 1e-1, 1e-1, 21, 21), + (1e-5, 1e-1, 0, 10, 0), + ]) # yapf: disable + def test_lr(self, min_lr, max_lr, decay_rate, cycle_step_size, decay_step_size): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": ONE_CYCLE, + "params": { + CYCLE_MIN_LR: min_lr, + CYCLE_MAX_LR: max_lr, + DECAY_LR_RATE: decay_rate, + CYCLE_FIRST_STEP_SIZE: cycle_step_size, + DECAY_STEP_SIZE: decay_step_size + } + }, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=max(50, + cycle_step_size * 3), + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + + step_lrs = [] + for _, batch in enumerate(data_loader): + step_lrs.extend(lr_scheduler.get_lr()) + 
loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + # Verify starting lr + assert step_lrs[0] == min_lr + + # Verify peak lr + assert step_lrs[cycle_step_size] == max_lr + + # Verify increasing phase + _verify_continuous_increase(step_lrs[:cycle_step_size]) + + # Verify decreasing phase + _verify_continuous_decrease(step_lrs[cycle_step_size:(cycle_step_size * 2)]) + + # Verify decay phase + if decay_rate > 0: + _verify_continuous_decrease(step_lrs[(cycle_step_size * 2):]) + + @pytest.mark.parametrize("min_mom, max_mom, decay_rate, step_size", + [ + (0.08, 0.09, 1e-3, 10), + (0.08, 0.09, 0, 21), + (0.08, 0.09, 1e-3, 10), + (0.08, 0.09, 0, 21), + ]) # yapf: disable + def test_mom(self, min_mom, max_mom, decay_rate, step_size): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": ONE_CYCLE, + "params": { + CYCLE_MIN_LR: 1e-3, + CYCLE_MAX_LR: 1e-2, + CYCLE_MIN_MOM: min_mom, + CYCLE_MAX_MOM: max_mom, + DECAY_MOM_RATE: decay_rate, + CYCLE_FIRST_STEP_SIZE: step_size, + DECAY_STEP_SIZE: step_size + } + }, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=max(50, + step_size * 3), + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + + step_moms = [] + for _, batch in enumerate(data_loader): + step_moms.append(lr_scheduler.get_mom()) + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + # Verify starting mom + assert step_moms[0][0][0] == max_mom + + # Verify peak mom + assert step_moms[step_size][0][0] == min_mom + + # Verify decreasing phase + _verify_continuous_decrease(step_moms[:step_size]) + + # Verify increasing phase + 
_verify_continuous_increase(step_moms[step_size:(step_size * 2)]) + + # Verify decay phase + if decay_rate > 0: + _verify_continuous_increase(step_moms[(step_size * 2):]) diff --git a/tests/unit/runtime/test_multi_output_model.py b/tests/unit/runtime/test_multi_output_model.py new file mode 100644 index 0000000000000000000000000000000000000000..0a802373a67a04f394def10958cd0a966251d7c4 --- /dev/null +++ b/tests/unit/runtime/test_multi_output_model.py @@ -0,0 +1,136 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed +from pytest import approx +from unit.common import DistributedTest +from unit.multi_output_model import MultiOutputModel, multi_output_dataloader + + +class TestTwoOutputModel(DistributedTest): + world_size = 1 + + def test(self, tmpdir): + grad_accumulation_steps = 2 + micro_batch_size = 1 + world_size = self.world_size + config_dict = { + "train_micro_batch_size_per_gpu": micro_batch_size, + "gradient_accumulation_steps": grad_accumulation_steps, + "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + hidden_dim = 10 + weight_value = 0.1 + + model = MultiOutputModel(hidden_dim, weight_value) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + total_samples = 4 + data_loader = multi_output_dataloader(model=model, + total_samples=total_samples, + hidden_dim=hidden_dim, + device=model.device, + inputs=[1.0, + 2.0], + targets=[1, + 2]) + for n, batch in enumerate(data_loader): + assert len(batch) % 2 == 0, \ + f"multi_output_dataloader failed to return even number of data samples (input+target)" + + midpoint = len(batch) // 2 + inputs, targets = batch[:midpoint], batch[midpoint:] + loss_tuple = model(inputs, targets) + + expected_loss = torch.tensor(2.302734375, + dtype=torch.half, + 
device=model.device) + for loss in loss_tuple: + assert loss.shape == torch.Size([]) + assert loss.item() == approx(expected_loss.item()) + + summed_loss = sum(loss_tuple) + scaled_loss = model.backward(summed_loss) + expected_scaled_loss = summed_loss.float() / grad_accumulation_steps + assert scaled_loss.item() == approx(expected_scaled_loss.item()) + + model.step() + + +class TestThreeOutputModel(DistributedTest): + world_size = 1 + + def test(self, tmpdir): + grad_accumulation_steps = 3 + micro_batch_size = 1 + world_size = 1 + config_dict = { + "train_micro_batch_size_per_gpu": micro_batch_size, + "gradient_accumulation_steps": grad_accumulation_steps, + "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + hidden_dim = 10 + weight_value = 0.1 + + model = MultiOutputModel(hidden_dim, weight_value) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + total_samples = grad_accumulation_steps * micro_batch_size * 2 + data_loader = multi_output_dataloader(model=model, + total_samples=total_samples, + hidden_dim=hidden_dim, + device=model.device, + inputs=[1.0, + 2.0, + 3.0], + targets=[1, + 2, + 3]) + for n, batch in enumerate(data_loader): + assert len(batch) % 2 == 0, \ + f"multi_output_dataloader failed to return even number of data samples (input+target)" + + midpoint = len(batch) // 2 + inputs, targets = batch[:midpoint], batch[midpoint:] + loss_tuple = model(inputs, targets) + assert len(loss_tuple) == 3 + + expected_loss = torch.tensor(2.302734375, + dtype=torch.half, + device=model.device) + + for loss in loss_tuple: + assert loss.shape == torch.Size([]) + assert loss.item() == approx(expected_loss.item()) + + summed_loss = sum(loss_tuple) + scaled_loss = model.backward(summed_loss) + expected_scaled_loss = summed_loss.float() / 
grad_accumulation_steps + assert scaled_loss.item() == approx(expected_scaled_loss.item()) + + model.step() diff --git a/tests/unit/runtime/test_pld.py b/tests/unit/runtime/test_pld.py new file mode 100644 index 0000000000000000000000000000000000000000..8b8ed2365d77678fda2eb55bd74c9715dbaa1129 --- /dev/null +++ b/tests/unit/runtime/test_pld.py @@ -0,0 +1,108 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import numpy as np +import deepspeed +import pytest +from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop + +from unit.common import DistributedTest +from unit.simple_model import SimpleModel, PLD_SimpleModel, random_dataloader + + +@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0]) +def test_pld_schedule(tmpdir, theta): + gamma = 0.001 + + pld_scheduler = ProgressiveLayerDrop(theta, gamma) + for i in range(10): + pld_scheduler.update_state(i) + expected_theta = (1. - theta) * np.exp(-gamma * i) + theta + actual_theta = pld_scheduler.get_theta() + assert expected_theta == actual_theta + + +@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0]) +class TestPLDModel(DistributedTest): + world_size = 1 + + def test_pld_model(self, theta): + gamma = 0.001 + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.0001 + } + }, + "fp16": { + "enabled": True + }, + "progressive_layer_drop": { + "enabled": True, + "theta": theta, + "gamma": gamma + } + } + hidden_dim = 10 + + model = PLD_SimpleModel(hidden_dim, empty_grad=False) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + + for i, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + expected_theta = (1. 
- theta) * np.exp(-gamma * i) + theta + actual_theta = model.get_pld_theta() + assert expected_theta == actual_theta + + +class TestNonPLDModel(DistributedTest): + world_size = 1 + + def test_non_pld_model(self): + gamma = 0.001 + theta = 0.5 + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.0001 + } + }, + "fp16": { + "enabled": True + }, + "progressive_layer_drop": { + "enabled": True, + "theta": theta, + "gamma": gamma + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=1, + hidden_dim=hidden_dim, + device=model.device) + + for i, batch in enumerate(data_loader): + with pytest.raises(TypeError): + loss = model(batch[0], batch[1]) diff --git a/tests/unit/runtime/test_runtime_utils.py b/tests/unit/runtime/test_runtime_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..18a8bb77a5b6a63fe644b288571f808ca4c00e82 --- /dev/null +++ b/tests/unit/runtime/test_runtime_utils.py @@ -0,0 +1,78 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +from torch._utils import _flatten_dense_tensors +import deepspeed.comm as dist +import pytest + +import deepspeed.runtime.utils as ds_utils +import deepspeed.utils.groups as groups +from deepspeed.accelerator import get_accelerator + +from unit.common import DistributedTest + + +def test_call_to_str(): + c2s = ds_utils.call_to_str + + assert c2s('int') == 'int()' + assert c2s('int', 3) == 'int(3)' + assert c2s('int', 3, 'jeff') == 'int(3, \'jeff\')' + + assert c2s('hello', val=3) == 'hello(val=3)' + assert c2s('hello', 1138, val=3) == 'hello(1138, val=3)' + + +class TestClibGradNorm(DistributedTest): + world_size = 2 + + def test(self): + param1 = torch.nn.Parameter(torch.Tensor([0])) + param1.grad = torch.Tensor([1]) 
+ param2 = torch.nn.Parameter(torch.Tensor([0])) + param2.grad = torch.Tensor([dist.get_rank() + 1]) + # param2 is now MoE parameter + param2.allreduce = False + + parameters = [param1, param2] + + groups._create_expert_and_data_parallel(2) + + norm = ds_utils.clip_grad_norm_(parameters, max_norm=0.1) + norm = torch.Tensor([norm]).to(get_accelerator().device_name(dist.get_rank())) + world_size = dist.get_world_size() + gathered_norm = [ + torch.zeros(1).to(get_accelerator().device_name()) for i in range(world_size) + ] + + dist.all_gather(gathered_norm, norm) + + assert gathered_norm[0] == gathered_norm[1], "norm at rank 0 does not match the norm at rank 1" + + +@pytest.mark.parametrize("check_using_norm", [(False), (True)]) +class TestCheckOverflow(DistributedTest): + world_size = 2 + + def test(self, check_using_norm): + groups._create_expert_and_data_parallel(2) + + param1 = torch.nn.Parameter(torch.Tensor([0])) + param1.grad = torch.Tensor([1]) + param2 = torch.nn.Parameter(torch.Tensor([0])) + if dist.get_rank() == 0: + param2.grad = torch.Tensor([1]) + else: + param2.grad = torch.Tensor([float("inf")]) + param2.allreduce = False + # param2 is now MoE parameter + parameters = [param1, param2] + if check_using_norm: + grads_group_flat = [_flatten_dense_tensors([p.grad for p in parameters])] + norm = ds_utils.get_weight_norm(grads_group_flat) + overflow_checker = ds_utils.CheckOverflow([parameters]) + overflow = overflow_checker.check_using_norm([norm], reduce_overflow=False) + else: + overflow_checker = ds_utils.CheckOverflow([parameters]) + overflow = overflow_checker.check() + assert overflow diff --git a/tests/unit/runtime/utils/test_partition.py b/tests/unit/runtime/utils/test_partition.py new file mode 100644 index 0000000000000000000000000000000000000000..58b62825de3f1de6e97b9a152bd9e59e19527936 --- /dev/null +++ b/tests/unit/runtime/utils/test_partition.py @@ -0,0 +1,197 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest + +import torch 
+import deepspeed.comm as dist + +from deepspeed.runtime.utils import partition_uniform +from deepspeed.runtime.utils import partition_balanced +from deepspeed.runtime.utils import prefix_sum_inc +from deepspeed.runtime.utils import PartitionedTensor +from deepspeed.accelerator import get_accelerator + +from unit.common import DistributedTest + + +class TestPartitionedTensor(DistributedTest): + world_size = 4 + + def test(self): + world = dist.get_world_size() + rank = dist.get_rank() + + group = dist.new_group(ranks=list(range(world))) + + rows = world * 4 + cols = 3 + + full = torch.rand(rows, cols).to(get_accelerator().device_name()) + dist.broadcast(full, src=0, group=group) + part = PartitionedTensor(full, group=group) + + assert len(part.local_size()) == 1 + assert part.local_size()[0] * world == full.numel() + + reconstructed = part.full() + assert torch.equal(full, reconstructed) + + +class TestPartitionedTensorMeta(DistributedTest): + world_size = 4 + + def test(self): + world = dist.get_world_size() + rank = dist.get_rank() + + group = dist.new_group(ranks=list(range(world))) + + rows = world * 7 + cols = 3 + + full = torch.rand(rows, cols).to(get_accelerator().device_name()) + dist.broadcast(full, src=0, group=group) + part = PartitionedTensor(full, group=group) + + my_meta = PartitionedTensor.from_meta(part.to_meta(), part.local_data, group) + assert torch.equal(full, my_meta.full()) + + +def assert_valid_partition(weights, parts, P): + N = len(weights) + assert len(parts) == P + 1 + assert parts[0] == 0 + assert parts[P] == N + for idx in range(P): + assert parts[idx] <= parts[idx + 1] + + +def get_partition_weights(weights, parts): + """ Return the amount of weight in each partition. 
""" + costs = [0] * (len(parts) - 1) + P = len(parts) - 1 + for p in range(P): + start = parts[p] + stop = parts[p + 1] + costs[p] = sum(weights[start:stop]) + return costs + + +def test_prefix_sum(): + x = [3, 4, 5] + psum = prefix_sum_inc(x) + assert psum == [3, 7, 12] + + +def test_valid_partition(): + N = 10 + P = 1 + weights = [1] * N + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + + +def test_short_partition_uniform(): + N = 2 + P = 4 + weights = [1] * N + parts = partition_uniform(len(weights), P) + assert_valid_partition(weights, parts, P) + + +def test_short_partition(): + N = 2 + P = 4 + weights = [1] * N + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + + +def test_easy_balance_uniform(): + weights = [1] * 8 + P = 4 + parts = partition_uniform(len(weights), P) + assert_valid_partition(weights, parts, P) + costs = get_partition_weights(weights, parts) + assert all(c == 2 for c in costs) + + +def test_easy_balance_balanced(): + weights = [1] * 8 + P = 4 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + costs = get_partition_weights(weights, parts) + assert all(c == 2 for c in costs), costs + + +def test_int_balanced(): + weights = [0, 1, 2, 3, 3, 3] + P = 4 + parts = partition_balanced(weights, P) + assert parts == [0, 3, 4, 5, 6] + + assert_valid_partition(weights, parts, P) + costs = get_partition_weights(weights, parts) + assert all(c == 3 for c in costs) + + +def test_float_balanced(): + weights = [0., 1.1, 1.9, 3., 3., 3.] + P = 4 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + assert parts == [0, 3, 4, 5, 6] + + +@pytest.mark.skip(reason="Variance-minimizing partitioning returns different result.") +def test_float_lastheavy(): + weights = [0., 1.1, 1.9, 3., 30.] 
+ P = 2 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + assert parts == [0, 4, 5] + + +def test_float_midheavy(): + weights = [0., 1.1, 30, 3.] + P = 3 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + assert parts == [0, 2, 3, 4] + + +def test_balance_bert(): + # Parameters per layer for a transformer model with 24 transformers and hidden dim 1024 + weights = [ + 52559872, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 0, + 52559872 + ] + P = 8 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) diff --git a/tests/unit/runtime/zero/test_ignore_unused_parameters.py b/tests/unit/runtime/zero/test_ignore_unused_parameters.py new file mode 100644 index 0000000000000000000000000000000000000000..efd4949c94606384c59dcf76d1f35e7d140eefdf --- /dev/null +++ b/tests/unit/runtime/zero/test_ignore_unused_parameters.py @@ -0,0 +1,64 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +from unit.common import DistributedTest +from unit.simple_model import UnusedParametersModel, random_dataloader +from deepspeed.ops.op_builder import CPUAdamBuilder + +import deepspeed + + +@pytest.mark.parametrize('ignore_unused_parameters', [False, True]) +class TestStage2IgnoreUnusedParameters(DistributedTest): + world_size = 1 + + def test(self, ignore_unused_parameters): + use_cpu_offload = True + + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": use_cpu_offload, + 
"ignore_unused_parameters": ignore_unused_parameters + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + hidden_dim = 4 + + model = UnusedParametersModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device) + + def _loop(): + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + if ignore_unused_parameters: + _loop() + else: + with pytest.raises(AssertionError) as e: + _loop() + assert e.value.args and 'ignore_unused_parameters' in e.value.args[0] diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py new file mode 100644 index 0000000000000000000000000000000000000000..5de3ffca27df16cfe0b9f15d66bbcaa3d41d2cf2 --- /dev/null +++ b/tests/unit/runtime/zero/test_zero.py @@ -0,0 +1,1386 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import math +from typing import Dict, List, Set +import pytest +import deepspeed.comm as dist +import torch +from torch import Tensor +from torch.nn import Linear, Module +from torch.nn.modules.container import ModuleList +from torch.nn.modules.loss import L1Loss +from torch.nn.parameter import Parameter + +from unit.common import DistributedTest +from unit.simple_model import SimpleModel, random_dataloader + +import deepspeed +from deepspeed.runtime.engine import DeepSpeedEngine +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint +from deepspeed.accelerator import get_accelerator + + +def run_unbalanced_gradients(model, data_loader): + def drop_some_gradients(model, iter): + odd_iteration = iter % 2 + for i, p in enumerate(model.parameters()): + 
p.requires_grad = (i % 2) == odd_iteration + + def enable_grads(model): + for p in model.parameters(): + p.requires_grad = True + + for i, batch in enumerate(data_loader): + drop_some_gradients(model, i + 1) + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + enable_grads(model) + + +def dump_state_dict(model): + if dist.get_rank() == 0: + print("state_dict:") + for name, param in model.named_parameters(): + print(f"{name} {param.data}") + + +@pytest.mark.parametrize('zero_stage', [1, 2, 3]) +class TestZeroUnbalancedGradients(DistributedTest): + world_size = 1 + + def test(self, zero_stage): + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + hidden_dim = 4 + + model = SimpleModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device) + + run_unbalanced_gradients(model, data_loader) + + +# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 +class TestZero3RepeatForwardLoop(DistributedTest): + world_size = 1 + + def test(self, zero_stage=3): + # force all params to be partitioned by forcing threshold=0 + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage, + "stage3_param_persistence_threshold": 0 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + hidden_dim = 4 + + class AlbertLikeModel(torch.nn.Module): + def __init__(self, hidden_dim): + super().__init__() + self.linear = 
torch.nn.Linear(hidden_dim, hidden_dim) + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + # run the same layer multiple times in a loop - to test a stack of forwards, followed by a stack of backwards + hidden = x + for i in range(3): + hidden = hidden + self.linear(hidden) + return self.cross_entropy_loss(hidden, y) + + model = AlbertLikeModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device) + + for i, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 +# also reproduces the https://github.com/microsoft/DeepSpeed/pull/1372 +@pytest.mark.parametrize('zero_stage', [2, 3]) +class TestZeroToFP32(DistributedTest): + world_size = 2 + + def test_1_param_group(self, tmpdir, zero_stage): + # XXX: ideally refactor with the 2_param_group test as 75% is the same + # force all params to be partitioned by forcing threshold=0 + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage, + "stage3_param_persistence_threshold": 0 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + + class MyModel(torch.nn.Module): + def __init__(self, hidden_dim, n_layers): + super().__init__() + # to reproduce https://github.com/microsoft/DeepSpeed/pull/1372 it is important that + # the number of total elements is uneven: + # (1) 4 layers of 3*(3+1)=12 elements each, 48 in total + self.ll = torch.nn.ModuleList( + torch.nn.Linear(hidden_dim, + hidden_dim) for i in range(n_layers)) + # (2) the following adds 4+1=5 elements + self.classifier = 
torch.nn.Linear(4, 1) + # total 48+5=53 (uneven as desired) elements + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + hidden = x + for l in self.ll: + hidden = l(hidden) + return self.cross_entropy_loss(hidden, y) + + hidden_dim = 3 # do not change + + world_size = dist.get_world_size() + # we want at least 2x layers as there are gpus to trigger round_robin_fp16_groups reshuffle in zero2 + n_layers = world_size * 2 + model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers) + + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device) + + for i, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + model.save_checkpoint(tmpdir) + + # make sure all sides saved it + dist.barrier() + + orig_state_dict = {} + for name, param in model.module.named_parameters(): + if zero_stage == 3: + with deepspeed.zero.GatheredParameters(param, modifier_rank=None): + orig_state_dict[name] = param.detach().cpu() + else: + orig_state_dict[name] = param.detach().cpu() + + if zero_stage == 3: + with deepspeed.zero.GatheredParameters(model.parameters(), + modifier_rank=None): + fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir) + fp32_state_dict = fp32_model.state_dict() + else: + fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir) + fp32_state_dict = fp32_model.state_dict() + + #dump_state_dict(fp32_model) + + if dist.get_rank() == 0: + for name in orig_state_dict.keys(): + # float() workaround for torch<1.6 + assert torch.allclose(orig_state_dict[name].float(), + fp32_state_dict[name].float()) + + def test_2_param_groups(self, tmpdir, zero_stage): + # TODO: + # - need to test with multiple param groups + # force all params to be partitioned by forcing threshold=0 + config_dict = { + 
"train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_allow_untested_optimizer": 1, + "zero_optimization": { + "stage": zero_stage, + "stage3_param_persistence_threshold": 0 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + + class MyModel(torch.nn.Module): + def __init__(self, hidden_dim, n_layers): + super().__init__() + self.ll = torch.nn.ModuleList( + torch.nn.Linear(hidden_dim, + hidden_dim) for i in range(n_layers)) + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + hidden = x + for l in self.ll: + hidden = l(hidden) + return self.cross_entropy_loss(hidden, y) + + hidden_dim = 3 + + world_size = dist.get_world_size() + n_layers = world_size * 2 + model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers) + + optim_groups = [ + { + "params": [l.weight for l in model.ll], + "weight_decay": 0.01, + }, + { + "params": [l.bias for l in model.ll], + "weight_decay": 0.0 + }, + ] + optim = torch.optim.SGD(optim_groups, lr=0.1) + + model, _, _, _ = deepspeed.initialize(model=model, + model_parameters=model.parameters(), + optimizer=optim, + config=config_dict + ) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device) + + for i, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + model.save_checkpoint(tmpdir) + + # make sure all sides saved it + dist.barrier() + + #dump_state_dict(model) + + orig_state_dict = {} + for name, param in model.module.named_parameters(): + if zero_stage == 3: + with deepspeed.zero.GatheredParameters(param, modifier_rank=None): + orig_state_dict[name] = param.detach().cpu() + else: + orig_state_dict[name] = param.detach().cpu() + + if zero_stage == 3: + with deepspeed.zero.GatheredParameters(model.parameters(), + modifier_rank=None): + fp32_model = 
load_state_dict_from_zero_checkpoint(model.module, tmpdir) + fp32_state_dict = fp32_model.state_dict() + else: + fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir) + fp32_state_dict = fp32_model.state_dict() + + #dump_state_dict(fp32_model) + + if dist.get_rank() == 0: + for name in orig_state_dict.keys(): + # float() workaround for torch<1.6 + assert torch.allclose(orig_state_dict[name].float(), + fp32_state_dict[name].float()) + + +@pytest.mark.parametrize("allgather_bucket_size", [1000, 1001]) +class TestIncorectAllgatherBucketSize(DistributedTest): + world_size = 1 + + def test(self, allgather_bucket_size, zero_stage=2): + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage, + "allgather_bucket_size": allgather_bucket_size + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + hidden_dim = 4 + + model = SimpleModel(hidden_dim=hidden_dim) + if allgather_bucket_size % 2 == 0: + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + else: + with pytest.raises(AssertionError) as assertinfo: + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + assert "allgather_bucket_size must be a multiple of nccl_start_alignment_factor" in str( + assertinfo) + + +class TestPartitionNcclAlignment(DistributedTest): + world_size = 4 + + def test(self, zero_stage=2): + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + hidden_dim = 4 + + model = SimpleModel(hidden_dim=hidden_dim) + model, _, _, _ = 
deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + # get nccl all-gather send buffers alignment factor + nccl_start_alignment_factor = model.optimizer.nccl_start_alignment_factor + + parallel_partitioned_bit16_groups = model.optimizer.parallel_partitioned_bit16_groups if zero_stage == 2 else model.optimizer.parallel_partitioned_fp16_groups + for data_parallel_partitions in parallel_partitioned_bit16_groups: + for partition_id, partitioned_data in enumerate(data_parallel_partitions): + # verify that data partition start locations are 4-byte aligned + assert (partitioned_data.data_ptr() % + (2 * nccl_start_alignment_factor) == 0) + + +def _ds_initialize_for_param_partitioning_testing(model: Module, + cfg: dict) -> DeepSpeedEngine: + ds_engine, _, _, _ = deepspeed.initialize( + config=cfg, + model=model, + model_parameters=model.parameters() + ) + + return ds_engine + + +def _assert_partition_status(model: Module, + valid_statuses: Set[ZeroParamStatus]) -> None: + for _, param in model.named_parameters(): + assert param.ds_status in valid_statuses, param.ds_summary() + + +def _assert_fully_available(model: Module) -> None: + for _, param in model.named_parameters(): + assert param.ds_status == ZeroParamStatus.AVAILABLE + + +class EltwiseMultiplicationModule(Module): + def __init__(self, weight: Parameter) -> None: + super().__init__() + self.weight = weight + + def forward(self, x: Tensor) -> Tensor: + _assert_fully_available(self) + result = self.weight * x + + return result + + +class EltwiseMultiplicationTestNetwork(Module): + """used for testing purposes""" + def __init__( + self, + weight1: Parameter, + weight2: Parameter, + weight3: Parameter, + ) -> None: + super().__init__() + self.__layer1 = EltwiseMultiplicationModule(weight1) + self.__layer2 = EltwiseMultiplicationModule(weight2) + self.__layer3 = EltwiseMultiplicationModule(weight3) + + self.loss = L1Loss(reduction="none") + + def forward(self, + x: Tensor, + 
y: Tensor, + use_module_trace: bool, + param_prefetching: bool) -> Dict[str, + Tensor]: + _assert_partition_status( + self, + { + ZeroParamStatus.NOT_AVAILABLE, + ZeroParamStatus.INFLIGHT, + ZeroParamStatus.AVAILABLE + } if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) + + pre_layer_expected_states = { + ZeroParamStatus.INFLIGHT + if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, + ZeroParamStatus.AVAILABLE, + } + + post_layer_expected_states = { + ZeroParamStatus.AVAILABLE + if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, + } + + _assert_partition_status(self.__layer1, pre_layer_expected_states) + hidden1 = self.__layer1(x) + _assert_partition_status(self.__layer1, post_layer_expected_states) + + _assert_partition_status(self.__layer2, pre_layer_expected_states) + hidden2 = self.__layer2(hidden1) + _assert_partition_status(self.__layer2, post_layer_expected_states) + + _assert_partition_status(self.__layer3, pre_layer_expected_states) + y_hat = self.__layer3(hidden2) + _assert_partition_status(self.__layer3, post_layer_expected_states) + + loss = self.loss(y_hat, y) + + _assert_partition_status( + self, + { + ZeroParamStatus.NOT_AVAILABLE, + ZeroParamStatus.INFLIGHT, + ZeroParamStatus.AVAILABLE + } if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) + + return { + "hidden1": hidden1, + "hidden2": hidden2, + "y_hat": y_hat, + "loss": loss, + } + + +@pytest.mark.parametrize("param_persistence_threshold", [0, 10]) +@pytest.mark.parametrize("fp16_enabled", [True, False]) +@pytest.mark.parametrize("contiguous_gradients", [True, False]) +@pytest.mark.parametrize("offload_optimizer", [True, False]) +@pytest.mark.parametrize("zero_grad", [True, False]) +@pytest.mark.parametrize("prefetching", [True, False]) +class TestZero3ParamPartitioningBase(DistributedTest): + world_size = 2 + + def test( + self, + param_persistence_threshold: int, + fp16_enabled: bool, + contiguous_gradients: bool, + offload_optimizer: bool, + zero_grad: bool, + 
prefetching: bool, + ) -> None: + if offload_optimizer and not contiguous_gradients: + return + + m = 3 + n = 5 + weights = [Parameter(torch.zeros((m, n), dtype=torch.float32)) for _ in range(3)] + model = EltwiseMultiplicationTestNetwork(*weights) + prefetch_bucket_size = sum([p.numel() for p in model.parameters(recurse=True)]) + cfg = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 3, + "stage3_max_reuse_distance": 0, + "stage3_param_persistence_threshold": param_persistence_threshold, + "contiguous_gradients": contiguous_gradients, + "stage3_prefetch_bucket_size": prefetch_bucket_size if prefetching else 0 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1. + } + }, + "fp16": { + "enabled": fp16_enabled, + "loss_scale": 1., + } + } + + if offload_optimizer: + cfg["zero_optimization"]["offload_optimizer"] = { + "device": "cpu", + "pin_memory": True, + } + + ds_engine = _ds_initialize_for_param_partitioning_testing(model, cfg) + for i, weight in enumerate(weights): + weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, + (i + 1) * (1 + dist.get_rank())) + + def create_tensor(vals, dtype: torch.dtype = None) -> Tensor: + return torch.as_tensor(vals, + dtype=dtype + or (torch.float16 if fp16_enabled else torch.float32), + device=ds_engine.device) + + expected_hidden1 = create_tensor([ + [1, + 1, + 1, + 1, + 1], + [1, + 1, + 1, + 2, + 2], + [2, + 2, + 2, + 2, + 2], + ]) + expected_hidden2 = create_tensor([ + [2, + 2, + 2, + 2, + 2], + [2, + 2, + 2, + 8, + 8], + [8, + 8, + 8, + 8, + 8], + ]) + expected_yhat = create_tensor([[6, + 6, + 6, + 6, + 6], + [6, + 6, + 6, + 48, + 48], + [48, + 48, + 48, + 48, + 48]]) + expected_loss = create_tensor([ + [5, + 5, + 5, + 5, + 5], + [5, + 5, + 5, + 47, + 47], + [47, + 47, + 47, + 47, + 47], + ]) + + for train_iter in range(3): + activations = ds_engine( + x=torch.ones((m, + n), + dtype=torch.float16 if fp16_enabled else torch.float32, + device=ds_engine.device), + 
y=torch.ones((m, + n), + dtype=torch.float16 if fp16_enabled else torch.float32, + device=ds_engine.device), + use_module_trace=train_iter > 0, + param_prefetching=prefetching and train_iter > 0, + ) + assert torch.allclose(activations["hidden1"], expected_hidden1) + assert torch.allclose(activations["hidden2"], expected_hidden2) + assert torch.allclose(activations["y_hat"], expected_yhat) + assert torch.allclose(activations["loss"], expected_loss) + + ds_engine.backward(activations["loss"].sum()) + + # check the gradients + grad_partitions = ds_engine.optimizer.get_fp32_grad_partitions() + assert set(grad_partitions.keys()) == {0}, f"should have one parameter group but got {len(grad_partitions)}" + assert set(grad_partitions[0].keys()) == {0, 1, 2} + dloss_wrt_layer1 = grad_partitions[0][0] + dloss_wrt_layer2 = grad_partitions[0][1] + dloss_wrt_layer3 = grad_partitions[0][2] + + assert dloss_wrt_layer1.dtype == torch.float + assert dloss_wrt_layer2.dtype == torch.float + assert dloss_wrt_layer3.dtype == torch.float + + # layer1 = [..., 1, 2, ...] + # layer2 = [..., 2, 4, ...] + # layer3 = [..., 3, 6, ...] 
+ # dloss_wrt_layer3 = hidden2 + # dloss_wrt_layer2 = layer3 * hidden1 + # dloss_wrt_layer1 = layer3 * layer2 * x + + grad_multiplier = 1 if zero_grad else (train_iter + 1) + if dist.get_rank() == 0: + assert torch.allclose( + dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([2] * 8, + torch.float)) + assert torch.allclose( + dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 1] * 8, + torch.float)) + assert torch.allclose( + dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 2 * 1] * 8, + torch.float)) + elif dist.get_rank() == 1: + # parameters dont split evenly across ranks so rank 1 has a zero-padded + # partition + assert torch.allclose( + dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([8] * 7) + [0], + torch.float)) + assert torch.allclose( + dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 2] * 7) + [0], + torch.float)) + assert torch.allclose( + dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0], + torch.float)) + else: + raise RuntimeError("test has world size of two") + + if zero_grad: + ds_engine.optimizer.zero_grad() + + # TODO. 
add testing for this - for now we just call it to make sure it + # doesn't throw + ds_engine.optimizer.step() + # taking an optimizer step invalidates all parameters, make sure everything + # has been partitioned afterwards + _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) + assert not math.isclose(ds_engine.optimizer._global_grad_norm, 0.0) + + +@pytest.mark.parametrize("init_context_manager", [True, False]) +class TestZero3ParamPartitioningLargeParam(DistributedTest): + world_size = 4 + + def test(self, init_context_manager: bool, param_sz: int = 8100) -> None: + class LargeParamModel(Module): + def __init__(self): + super().__init__() + self.param = Parameter(torch.zeros((param_sz, ), dtype=torch.float32)) + + # only do weight initialization on root rank to + # make sure we are broadcasting correctly from rank 0 + if dist.get_rank() == 0: + partition_sz = math.ceil(self.param.numel() / dist.get_world_size()) + offset = 0 + for rank in range(dist.get_world_size()): + with torch.no_grad(): + self.param[offset:offset + partition_sz].fill_(rank) + offset += partition_sz + + def forward(self, x: Tensor) -> Tensor: + return x * self.param + + ds_config = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 3, + "stage3_max_reuse_distance": 0, + "contiguous_gradients": True, + "overlap_comm": True, + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1. 
+ } + }, + "fp16": { + "enabled": True, + "loss_scale": 1., + } + } + with deepspeed.zero.Init(mem_efficient_linear=False, + enabled=init_context_manager): + model = LargeParamModel() + ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_config) + + for train_iter in range(3): # test multiple iterations to cover prefetching + activation: Tensor = ds_engine( + torch.ones(param_sz, + dtype=torch.float16, + device=ds_engine.device)) + + partition_sz = math.ceil(param_sz / self.world_size) + for rank_idx, start_idx in enumerate(range(0, param_sz, partition_sz)): + activation_from_partition = activation[start_idx:start_idx + + partition_sz] + assert torch.allclose( + activation_from_partition, + torch.full_like(activation_from_partition, + rank_idx)) + + ds_engine.backward(activation.sum()) + ds_engine.allreduce_gradients() + + avgd_gradients = ds_engine.optimizer.averaged_gradients + assert set(avgd_gradients.keys()) == {0}, "should only have one parameter group" + weight_gradient, = avgd_gradients[0] + expected_weight_gradient = (train_iter + 1) * torch.full_like( + weight_gradient, + 1) + + assert torch.allclose(weight_gradient, expected_weight_gradient) + + +@pytest.mark.parametrize("param_sz", [100, 1_000, 10_000]) +@pytest.mark.parametrize("n_layers", [100, 1_000]) +@pytest.mark.parametrize("init_context_manager", [True, False]) +class TestZero3ParamPartitioningManyParams(DistributedTest): + world_size = 4 + + def test(self, param_sz: int, n_layers: int, init_context_manager: bool) -> None: + class ManyParamModel(Module): + def __init__(self) -> None: + super().__init__() + + self.modulelist = ModuleList( + EltwiseMultiplicationModule( + weight=Parameter(torch.empty((param_sz, + ), + dtype=torch.float32))) + for _ in range(n_layers)) + + for layer_num, module in enumerate(self.modulelist): + with deepspeed.zero.GatheredParameters(module.weight, + modifier_rank=0): + param: Parameter = module.weight + partition_sz = math.ceil(param.numel() / 
dist.get_world_size()) + offset = 0 + for rank in range(dist.get_world_size()): + with torch.no_grad(): + param[offset:offset + partition_sz].fill_(2 * layer_num * + rank) + offset += partition_sz + + def forward(self, x: Tensor) -> Tensor: + activations = [] + + for module in self.modulelist: + x = module(x) + activations.append(x) + + return activations + + ds_cfg = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 3, + "stage3_max_reuse_distance": 0, + "contiguous_gradients": True, + "overlap_comm": True, + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1. + } + }, + "fp16": { + "enabled": True, + "loss_scale": 1., + } + } + + with deepspeed.zero.Init(config=ds_cfg, + mem_efficient_linear=False, + enabled=init_context_manager): + model = ManyParamModel() + + ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_cfg) + + for _ in range(3): # test multiple iterations to cover prefetching + activations: List[Tensor] = ds_engine( + torch.ones((param_sz, + ), + dtype=torch.float16, + device=ds_engine.device)) + assert len(activations) == n_layers + + partition_sz = math.ceil(param_sz / self.world_size) + expected_activations = torch.empty(param_sz, + dtype=torch.float16, + device=ds_engine.device) + for start_idx in range(0, param_sz, partition_sz): + expected_activations[start_idx:start_idx + + partition_sz] = dist.get_rank() + + for layer_num, activation in enumerate(activations): + expected_activations *= 2 * layer_num + assert torch.allclose(activation, expected_activations) + + # TODO. 
finish writing this test + ds_engine.backward(activations[-1].sum()) + + avgd_gradients = ds_engine.optimizer.averaged_gradients + assert set(avgd_gradients.keys()) == {0}, "should only have one parameter group" + weight_gradients: List[Tensor] = avgd_gradients[0] + + for layer_num, activation in enumerate(weight_gradients): + pass + + +class TestZero3InitForParentWeightInitialization(DistributedTest): + world_size = 4 + + def test(self): + class ModelWhereParentInitializesChildWeights(Module): + def __init__(self) -> None: + super().__init__() + + self.linear = Linear(12, 1) + + self.apply(self.__init_weights) + + def __init_weights(self, module): + if isinstance(module, Linear): + with torch.no_grad(): + module.weight.fill_(1 + dist.get_rank()) + + ds_cfg = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 3, + "stage3_max_reuse_distance": 0, + "contiguous_gradients": True, + "overlap_comm": True, + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1. 
+ } + }, + "fp16": { + "enabled": True, + "loss_scale": 1., + } + } + + with deepspeed.zero.Init(config=ds_cfg, + mem_efficient_linear=False, + enabled=True): + model = ModelWhereParentInitializesChildWeights() + + assert model.linear.weight.ds_tensor.numel() == math.ceil(12 / self.world_size) + assert torch.allclose(model.linear.weight.ds_tensor, + torch.full_like(model.linear.weight.ds_tensor, + 1)) + + +@pytest.mark.skip("not working") +@pytest.mark.parametrize("param_persistence_threshold", [0, 10]) +@pytest.mark.parametrize("contiguous_gradients", [True, False]) +@pytest.mark.parametrize("offload_optimizer", [True, False]) +@pytest.mark.parametrize("zero_grad", [True, False]) +@pytest.mark.parametrize("prefetching", [True, False]) +class TestZero3ParamPartitioningBaseBF16(DistributedTest): + world_size = 2 + + def test( + self, + param_persistence_threshold: int, + contiguous_gradients: bool, + offload_optimizer: bool, + zero_grad: bool, + prefetching: bool, + ) -> None: + if offload_optimizer and not contiguous_gradients: + return + + m = 3 + n = 5 + weights = [Parameter(torch.zeros((m, n), dtype=torch.float32)) for _ in range(3)] + model = EltwiseMultiplicationTestNetwork(*weights) + prefetch_bucket_size = sum([p.numel() for p in model.parameters(recurse=True)]) + cfg = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 3, + "stage3_max_reuse_distance": 0, + "stage3_param_persistence_threshold": param_persistence_threshold, + "contiguous_gradients": contiguous_gradients, + "stage3_prefetch_bucket_size": prefetch_bucket_size if prefetching else 0 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1. 
+ } + }, + "bf16": { + "enabled": True, + "loss_scale": 1., + } + } + + if offload_optimizer: + cfg["zero_optimization"]["offload_optimizer"] = { + "device": "cpu", + "pin_memory": True, + } + + ds_engine = _ds_initialize_for_param_partitioning_testing(model, cfg) + for i, weight in enumerate(weights): + weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, + (i + 1) * (1 + dist.get_rank())) + + def create_tensor(vals): + return torch.as_tensor(vals, dtype=torch.bfloat16, device=ds_engine.device) + + expected_hidden1 = create_tensor([ + [1, + 1, + 1, + 1, + 1], + [1, + 1, + 1, + 2, + 2], + [2, + 2, + 2, + 2, + 2], + ]) + expected_hidden2 = create_tensor([ + [2, + 2, + 2, + 2, + 2], + [2, + 2, + 2, + 8, + 8], + [8, + 8, + 8, + 8, + 8], + ]) + expected_yhat = create_tensor([[6, + 6, + 6, + 6, + 6], + [6, + 6, + 6, + 48, + 48], + [48, + 48, + 48, + 48, + 48]]) + expected_loss = create_tensor([ + [5, + 5, + 5, + 5, + 5], + [5, + 5, + 5, + 47, + 47], + [47, + 47, + 47, + 47, + 47], + ]) + + for train_iter in range(3): + _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) + activations = ds_engine( + x=torch.ones((m, + n), + dtype=torch.bfloat16, + device=ds_engine.device), + y=torch.ones((m, + n), + dtype=torch.bfloat16, + device=ds_engine.device), + use_module_trace=train_iter > 0, + param_prefetching=prefetching and train_iter > 0, + ) + assert torch.allclose(activations["hidden1"], expected_hidden1) + assert torch.allclose(activations["hidden2"], expected_hidden2) + assert torch.allclose(activations["y_hat"], expected_yhat) + assert torch.allclose(activations["loss"], expected_loss) + + ds_engine.backward(activations["loss"].sum()) + _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) + + # check the gradients + grad_partitions = ds_engine.optimizer.get_fp32_grad_partitions() + assert set(grad_partitions.keys()) == {0}, f"should have one parameter group but got {len(grad_partitions)}" + assert set(grad_partitions[0].keys()) 
== {0, 1, 2} + dloss_wrt_layer1 = grad_partitions[0][0] + dloss_wrt_layer2 = grad_partitions[0][1] + dloss_wrt_layer3 = grad_partitions[0][2] + + # layer1 = [..., 1, 2, ...] + # layer2 = [..., 2, 4, ...] + # layer3 = [..., 3, 6, ...] + # dloss_wrt_layer3 = hidden2 + # dloss_wrt_layer2 = layer3 * hidden1 + # dloss_wrt_layer1 = layer3 * layer2 * x + + expected_grad_dtype = torch.float32 if offload_optimizer else torch.bfloat16 + + grad_multiplier = 1 if zero_grad else (train_iter + 1) + if dist.get_rank() == 0: + assert torch.allclose( + dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype)) + assert torch.allclose( + dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype)) + assert torch.allclose( + dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * + create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype)) + elif dist.get_rank() == 1: + # parameters dont split evenly across ranks so rank 1 has a zero-padded + # partition + assert torch.allclose( + dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * + create_tensor(([8] * 7) + [0]).to(expected_grad_dtype)) + assert torch.allclose( + dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * + create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype)) + assert torch.allclose( + dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * + create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype)) + else: + raise RuntimeError("test has world size of two") + + if zero_grad: + ds_engine.optimizer.zero_grad() + + # TODO. 
add testing for this - for now we just call it to make sure it + # doesn't throw + ds_engine.optimizer.step() + _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) + + +class TestZeroOffloadStage1(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 4, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": 1, + "offload_optimizer": { + "device": "cpu" + } + } + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, + model_parameters=model.parameters(), + config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + dist.barrier() + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.parametrize('return_type', [tuple, list, dict]) +class TestZero3DictFwd(DistributedTest): + world_size = 1 + + def test(self, return_type): + config_dict = { + "train_batch_size": 4, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": 3 + } + } + hidden_dim = 10 + + class MyModel(torch.nn.Module): + def __init__(self, hidden_dim): + super(MyModel, self).__init__() + self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) + self.cel = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + x = self.l1(x) + loss = self.cel(x, y) + if return_type == dict: + val = {'a': x, 'loss': loss, 'b': 1, 'c': None} + elif return_type == list: + val = [x, loss] + elif return_type == tuple: + val = (x, loss) + else: + raise NotImplementedError + return val + + with deepspeed.zero.Init(): + model = MyModel(hidden_dim) + + model, _, _, _ = deepspeed.initialize(model=model, + 
model_parameters=model.parameters(), + config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + dist.barrier() + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if return_type == dict: + loss = loss['loss'] + else: + loss = loss[1] + model.backward(loss) + model.step() + + +@pytest.mark.parametrize('zero_stage', [1, 2, 3]) +class TestZeroAdamOptimizerStepCount(DistributedTest): + world_size = 1 + + def test(self, zero_stage): + # force all params to be partitioned by forcing threshold=0 + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage, + "stage3_param_persistence_threshold": 0, + "sub_group_size": 4, + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + hidden_dim = 4 + + model = SimpleModel(hidden_dim=hidden_dim, nlayers=12) + model, optimizer, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device) + + for i, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + step_counts = [] + if zero_stage == 3: + for sub_group_id, _ in enumerate(optimizer.fp16_groups): + fp32_param = optimizer.fp32_partitioned_groups_flat[sub_group_id] + state = optimizer.optimizer.state[fp32_param] + step_counts.append(state['step']) + assert all(step == step_counts[0] for step in step_counts) + elif zero_stage == 1 or zero_stage == 2: + for param_group in optimizer.optimizer.param_groups: + for param in param_group['params']: + state = optimizer.optimizer.state[param] + step_counts.append(state['step']) + assert all(step == step_counts[0] for step in step_counts) + + 
+class TestZeroFrozenWeights(DistributedTest): + world_size = 1 + + def test(self): + config_dict = { + "train_batch_size": 4, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": 3 + } + } + hidden_dim = 10 + + class MyModel(torch.nn.Module): + def __init__(self, hidden_dim): + super(MyModel, self).__init__() + self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) + self.l2 = torch.nn.Linear(hidden_dim, hidden_dim) + self.act = torch.nn.ReLU() + self.cel = torch.nn.CrossEntropyLoss() + + # freeze one fc + self.l2.weight.requires_grad = False + self.l2.bias.requires_grad = False + + def forward(self, x, y): + x = self.l1(x) + x = self.act(x) + x = self.l2(x) + loss = self.cel(x, y) + val = (x, loss) + return val + + with deepspeed.zero.Init(config_dict_or_path=config_dict): + model = MyModel(hidden_dim) + + model, _, _, _ = deepspeed.initialize(model=model, + model_parameters=model.parameters(), + config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + dist.barrier() + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + loss = loss[1] + model.backward(loss) + model.step() diff --git a/tests/unit/runtime/zero/test_zero_config.py b/tests/unit/runtime/zero/test_zero_config.py new file mode 100644 index 0000000000000000000000000000000000000000..84852ec2e6f812a8ae393413dfc8867766e5254d --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_config.py @@ -0,0 +1,74 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from deepspeed.runtime.zero.config import DeepSpeedZeroConfig, DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig + + +def test_zero_config_deprecatedfields(): + config = DeepSpeedZeroConfig(**{"cpu_offload_param": True}) + assert isinstance(config.offload_param, DeepSpeedZeroOffloadParamConfig) + + config = 
DeepSpeedZeroConfig(**{"cpu_offload": True}) + assert isinstance(config.offload_optimizer, DeepSpeedZeroOffloadOptimizerConfig) + + config = DeepSpeedZeroConfig(**{"stage3_gather_fp16_weights_on_model_save": True}) + assert config.gather_16bit_weights_on_model_save == True + + +def test_zero_config_aliasfields(): + config = DeepSpeedZeroConfig(**{"stage3_prefetch_bucket_size": 12345}) + assert config.prefetch_bucket_size == 12345 + + config = DeepSpeedZeroConfig(**{"stage3_param_persistence_threshold": 12345}) + assert config.param_persistence_threshold == 12345 + + config = DeepSpeedZeroConfig(**{"stage3_max_reuse_distance": 12345}) + assert config.max_reuse_distance == 12345 + + config = DeepSpeedZeroConfig(**{"stage3_gather_16bit_weights_on_model_save": True}) + assert config.gather_16bit_weights_on_model_save == True + + +def test_zero_config_overlapcomm(): + for stage in [0, 1, 2]: + config = DeepSpeedZeroConfig(**{"stage": stage}) + assert config.overlap_comm == False + + config = DeepSpeedZeroConfig(**{"stage": 3}) + assert config.overlap_comm == True + + +def test_zero_config_offload_configs(): + config = DeepSpeedZeroConfig() + assert config.offload_param == None + assert config.offload_optimizer == None + + config = DeepSpeedZeroConfig(**{"offload_param": None, "offload_optimizer": None}) + assert config.offload_param == None + assert config.offload_optimizer == None + + config = DeepSpeedZeroConfig(**{"offload_param": {}, "offload_optimizer": {}}) + assert isinstance(config.offload_param, DeepSpeedZeroOffloadParamConfig) + assert isinstance(config.offload_optimizer, DeepSpeedZeroOffloadOptimizerConfig) + + +def test_zero_offload_optimizer_config_pipeline(): + config = DeepSpeedZeroOffloadOptimizerConfig() + assert config.pipeline == False + + config = DeepSpeedZeroOffloadOptimizerConfig(**{ + "pipeline_read": True, + "pipeline_write": False + }) + assert config.pipeline == True + + config = DeepSpeedZeroOffloadOptimizerConfig(**{ + "pipeline_read": 
False, + "pipeline_write": True + }) + assert config.pipeline == True + + config = DeepSpeedZeroOffloadOptimizerConfig(**{ + "pipeline_read": True, + "pipeline_write": True + }) + assert config.pipeline == True diff --git a/tests/unit/runtime/zero/test_zero_context.py b/tests/unit/runtime/zero/test_zero_context.py new file mode 100644 index 0000000000000000000000000000000000000000..a88db44888efd35546ff09f38bcac27f44f5040d --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_context.py @@ -0,0 +1,269 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from types import SimpleNamespace + +import torch +import deepspeed +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, partitioned_param_data_shape +import deepspeed.comm as dist + +from unit.common import DistributedTest +from unit.simple_model import SimpleModel +from utils import setup_serial_env + + +# Test that no sub-class or super-class is missed +class ConvX(torch.nn.Conv1d): + def __init__(self, *args): + super().__init__(*args) + # This would not be partitioned before bugfix 5ca8167 + self.param_in = torch.nn.Parameter(torch.FloatTensor(5).uniform_()) + + def forward(self, x): + return x + + +class ConvNet(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = ConvX(1, 3, 4) + self.param = torch.nn.Parameter(torch.FloatTensor(5).uniform_()) + + def forward(self, x): + return x + + +config = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 138. 
+ }, + "zero_optimization": { + "stage": 3, + "stage3_param_persistence_threshold": 1, + } +} + + +class TestZeroGatheredParametersFree(DistributedTest): + world_size = 1 + + def test(self): + config_dict = {"train_batch_size": 1, "zero_optimization": {"stage": 3}} + hidden_dim = 10 + + class MyModel(torch.nn.Module): + def __init__(self, hidden_dim): + super(MyModel, self).__init__() + self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) + + with deepspeed.zero.Init(config_dict_or_path=config_dict): + model = MyModel(hidden_dim) + + with deepspeed.zero.GatheredParameters(list(model.parameters())): + assert model.l1.weight.numel() != 0, "GatheredParameters should give a non-0-sized tensor" + + # on exit from `GatheredParameters` the gathered params should be freed and not leak memory + assert model.l1.weight.numel() == 0, "outside of GatheredParameters the param should go back to be 0-sized" + + +class TestSerialContext(DistributedTest): + world_size = 1 + init_distributed = False + set_dist_env = False + + def test_subclass_param(self): + setup_serial_env() + with deepspeed.zero.Init(config=config): + model = ConvNet() + + assert model.param.ds_status == ZeroParamStatus.NOT_AVAILABLE + assert model.conv1.param_in.ds_status == ZeroParamStatus.NOT_AVAILABLE + + def test_scattered_init_dist(self): + setup_serial_env() + assert not dist.is_initialized() + with deepspeed.zero.Init(): + assert dist.is_initialized() + + def test_scatter_halftype(self): + setup_serial_env() + + with deepspeed.zero.Init(): + l = torch.nn.Linear(10, 10) + assert l.weight.ds_tensor.dtype == torch.float16 + + y = torch.LongTensor([3, 3]) + assert y.dtype == torch.long + + def test_throughput_calculation(self): + setup_serial_env() + + train_micro_batch_size_per_gpu = 7 + gradient_accumulation_steps = 6 + config_dict = { + "train_micro_batch_size_per_gpu": train_micro_batch_size_per_gpu, + "gradient_accumulation_steps": gradient_accumulation_steps, + "optimizer": { + "type": "Adam", + "params": { 
+ "lr": 0.001, + } + }, + "zero_optimization": { + "stage": 0 + }, + } + + args = SimpleNamespace(local_rank=0) + net = SimpleModel(hidden_dim=4) + engine, _, _, _ = deepspeed.initialize(args=args, + config=config_dict, + model=net, + model_parameters=net.parameters()) + assert engine.tput_timer.batch_size == train_micro_batch_size_per_gpu * gradient_accumulation_steps + + assert not engine.tput_timer.initialized + assert not engine.tput_timer.started + assert engine.tput_timer.start_step == 2 + assert engine.tput_timer.start_time == 0 + assert engine.tput_timer.micro_step_count == 0 + assert engine.tput_timer.global_step_count == 0 + assert engine.tput_timer.total_elapsed_time == 0 + + # calling stop() while uninitialized - has no effect + engine.tput_timer.stop() + assert not engine.tput_timer.initialized + assert not engine.tput_timer.started + assert engine.tput_timer.start_time == 0 + assert engine.tput_timer.micro_step_count == 0 + assert engine.tput_timer.global_step_count == 0 + assert engine.tput_timer.total_elapsed_time == 0 + + # any call to start() (from dataloader or not) initializes the timer + engine.tput_timer.start() + assert engine.tput_timer.initialized + assert engine.tput_timer.started + assert engine.tput_timer.start_time == 0 + assert engine.tput_timer.micro_step_count == 0 + assert engine.tput_timer.global_step_count == 0 + assert engine.tput_timer.total_elapsed_time == 0 + + # calling stop() after initialized - increments the local micro step counter + engine.tput_timer.stop() + assert engine.tput_timer.initialized + assert not engine.tput_timer.started + assert engine.tput_timer.start_time == 0 + assert engine.tput_timer.micro_step_count == 1 + assert engine.tput_timer.global_step_count == 0 + assert engine.tput_timer.total_elapsed_time == 0 + + # calling start()/stop() to increment the step counter until start_step + while engine.tput_timer.micro_step_count < (gradient_accumulation_steps * + engine.tput_timer.start_step): + 
engine.tput_timer.start() + global_step = (engine.tput_timer.micro_step_count + + 1) % gradient_accumulation_steps == 0 + engine.tput_timer.stop(global_step=global_step) + assert engine.tput_timer.global_step_count == engine.tput_timer.start_step + assert engine.tput_timer.total_elapsed_time == 0 + + # calling start()/stop() accumulates duration during gradient accumulation + while engine.tput_timer.global_step_count == engine.tput_timer.start_step: + engine.tput_timer.start() + current_duration = engine.tput_timer.step_elapsed_time + total_duration = engine.tput_timer.total_elapsed_time + + global_step = (engine.tput_timer.micro_step_count + + 1) % gradient_accumulation_steps == 0 + engine.tput_timer.stop(global_step=global_step) + duration = engine.tput_timer.end_time - engine.tput_timer.start_time + # step elapsed time is reset after gradient accumulation steps + assert engine.tput_timer.step_elapsed_time == ( + 0 if engine.tput_timer.global_step_count != engine.tput_timer.start_step + else current_duration + duration) + assert engine.tput_timer.total_elapsed_time == total_duration + duration + + def test_ext_param_getattr(self): + setup_serial_env() + + class ExtLinear(torch.nn.Module): + def __init__(self, dim=16): + super().__init__() + self.dim = dim + self.linear1 = torch.nn.Linear(dim, dim) + self.linear2 = torch.nn.Linear(dim, dim) + + def forward(self, input): + A = self.linear1(input) + B = self.linear2(A) + + # external use of self.linear1.weight + C = torch.nn.functional.linear(B, self.linear1.weight) + return C.sum() + + net = ExtLinear() + + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config=config) + + with deepspeed.zero.GatheredParameters(net.linear1.weight): + assert net.linear1.weight.numel() == net.dim**2 + + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + engine.backward(loss) + engine.step() + + +class 
TestScatterGather(DistributedTest): + world_size = 2 + + def test(self): + with deepspeed.zero.Init(): + l = torch.nn.Linear(6, 3) + assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE + assert l.weight.shape == torch.Size(partitioned_param_data_shape) + + # Ensure there is no impact outside the context + l2 = torch.nn.Linear(6, 3) + assert not hasattr(l2.weight, 'ds_status') + assert l2.weight.numel() == l2.in_features * l2.out_features + + with deepspeed.zero.GatheredParameters(l.weight): + assert l.weight.ds_status == ZeroParamStatus.AVAILABLE + assert l.weight.numel() == l.in_features * l.out_features + + +class TestGatherUpdate(DistributedTest): + world_size = 2 + + def test(self): + with deepspeed.zero.Init(): + l = torch.nn.Linear(4, 2) + assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE + + # Gather and make a change + with deepspeed.zero.GatheredParameters(l.weight, modifier_rank=1): + assert l.weight.ds_status == ZeroParamStatus.AVAILABLE + if dist.get_rank() == 1: + with torch.no_grad(): + l.weight.zero_() + + # should now be scattered again + + # Now gather again and ensure the change is global + with deepspeed.zero.GatheredParameters(l.weight): + # all ranks compare + assert torch.equal(l.weight, torch.zeros_like(l.weight)) diff --git a/tests/unit/runtime/zero/test_zero_context_ancestry.py b/tests/unit/runtime/zero/test_zero_context_ancestry.py new file mode 100644 index 0000000000000000000000000000000000000000..38ae524906d5acbcf366773ec93ec028e3a2b77f --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_context_ancestry.py @@ -0,0 +1,111 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import deepspeed +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.accelerator import get_accelerator + +from utils import setup_serial_env +from unit.common import DistributedTest + +config = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + 
"lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 138. + }, + "zero_optimization": { + "stage": 3, + "stage3_param_persistence_threshold": 1, + } +} + + +# test that sub-classes get params that aren't prematurely partitioned and thus requiring gathering +# fixed by https://github.com/microsoft/DeepSpeed/pull/1202 +class GrandPa(torch.nn.Module): + def __init__(self, *args): + super().__init__(*args) + self.param_grandpa = torch.nn.Parameter(torch.ones(5)) + self.param_grandpa.data = (self.param_grandpa.data + + 1).data # test param is not yet partitioned + + +class Pa(GrandPa): + def __init__(self, *args): + super().__init__(*args) + self.param_pa = torch.nn.Parameter(torch.ones(5)) + self.param_pa.data = (self.param_pa.data + + 1).data # test param is not yet partitioned + self.param_grandpa.data = (self.param_grandpa.data + + 1).data # test param is not yet partitioned + + +class Son(Pa): + def __init__(self): + super().__init__() + self.param = torch.nn.Parameter(torch.ones(5)) + self.param.data = (self.param.data + 1).data # test param is not yet partitioned + self.param_pa.data = (self.param_pa.data + + 1).data # test param is not yet partitioned + self.param_grandpa.data = (self.param_grandpa.data + + 1).data # test param is not yet partitioned + + +class TestSerialParamInit(DistributedTest): + world_size = 1 + init_distributed = False + set_dist_env = False + + def test_subclass_param_init(self): + setup_serial_env() + with deepspeed.zero.Init(config=config): + model = Son().cpu() + + # test that all params have been partitioned + assert model.param_grandpa.ds_status == ZeroParamStatus.NOT_AVAILABLE + assert model.param_pa.ds_status == ZeroParamStatus.NOT_AVAILABLE + assert model.param.ds_status == ZeroParamStatus.NOT_AVAILABLE + + # test that the weights manipulation during each __init__ worked in all w/o needing gathering + ones = torch.ones(5).half().to(get_accelerator().device_name()) + with 
deepspeed.zero.GatheredParameters(list(model.parameters(recurse=False))): + assert torch.equal(model.param, ones + 1) + assert torch.equal(model.param_pa, ones + 2) + assert torch.equal(model.param_grandpa, ones + 3) + + +class TestDSInitWZinit(DistributedTest): + world_size = 2 + + def test(self): + ds_config = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + } + } + + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.linear = torch.nn.Linear(4, 4) + + def magic(self): + return 42 + + with deepspeed.zero.Init(): + model = Model() + engine, *_ = deepspeed.initialize(model=model, config=ds_config, model_parameters=model.parameters()) + assert engine.magic() == 42 diff --git a/tests/unit/runtime/zero/test_zero_context_return.py b/tests/unit/runtime/zero/test_zero_context_return.py new file mode 100644 index 0000000000000000000000000000000000000000..68329cb886c277849319046a59eef6cd022b5d2a --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_context_return.py @@ -0,0 +1,184 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from types import SimpleNamespace +import torch +import pytest +import deepspeed +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + +from utils import setup_serial_env +from unit.common import DistributedTest + + +class DanglingBias(torch.nn.Linear): + def forward(self, *inputs): + out = super().forward(*inputs) + # return the bias to trigger a dangling external param + return out, self.bias + + +class DataClass: + """Just wraps data in an object. 
""" + def __init__(self, out=None, bias=None): + self.out = out + self.bias = bias + + +class DanglingBiasClass(DanglingBias): + def forward(self, *inputs): + out, bias = super().forward(*inputs) + return DataClass(out=out, bias=bias) + + +class DanglingAttention(torch.nn.Linear): + def __init__(self, dim=16, return_obj=False): + super().__init__(dim, dim) + self.dim = dim + self.return_obj = return_obj + if return_obj: + self.d_linear = DanglingBiasClass(dim, dim) + else: + self.d_linear = DanglingBias(dim, dim) + + def forward(self, input): + out = super().forward(input) + if self.return_obj: + out_obj = self.d_linear(out) + assert out_obj.bias.ds_status == ZeroParamStatus.AVAILABLE + # forward the external param + return out_obj.out, out_obj.bias + else: + out, bias = self.d_linear(out) + assert hasattr(bias, 'ds_status') or hasattr(bias, 'ds_param_alias') + z3_bias = bias if hasattr(bias, 'ds_status') else bias.ds_param_alias + assert z3_bias.ds_status == ZeroParamStatus.AVAILABLE + return out, bias + + +class ModelContainer(torch.nn.Module): + def __init__(self, dim=16, return_obj=False): + super().__init__() + self.dim = dim + self.linear1 = torch.nn.Linear(dim, dim) + self.dangler = DanglingAttention(dim, return_obj=return_obj) + + def forward(self, input): + act1 = self.linear1(input) + # bias is actually dangler.d_linear1.bias + act2, bias = self.dangler(act1) + return (act2 + bias).sum() + + +class DanglingExt(torch.nn.Module): + def __init__(self, dim=16): + super().__init__() + self.dim = dim + self.container = ModelContainer(dim) + + def forward(self, input): + out = self.container(input) + + # Make sure it's at the right level of the stack + assert len(self._external_params) == 0 + assert len(self.container._external_params) == 1 + assert len(self.container.dangler._external_params) == 0 + return out + + +class ModelContainerVariableOutputType(ModelContainer): + def __init__(self, dim=16, output_type=dict): + super().__init__() + self.output_type = 
output_type + self.dim = dim + self.linear1 = torch.nn.Linear(dim, dim) + + def forward(self, input): + act1 = self.linear1(input) + if self.output_type is dict: + return {'loss': act1.sum()} + if self.output_type is torch.tensor: + return act1.sum() + + +config = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 138. + }, + "zero_optimization": { + "stage": 3, + "stage3_param_persistence_threshold": 1, + } +} + + +class TestReturnParam(DistributedTest): + world_size = 1 + + def test_ext_param_return(self): + setup_serial_env() + + net = DanglingExt() + + args = SimpleNamespace(local_rank=0) + engine, _, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config=config) + + for _ in range(5): + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + engine.backward(loss) + engine.step() + + @pytest.mark.skip('WIP') + def test_ext_param_returnobj(self): + setup_serial_env() + print() + + net = ModelContainer(return_obj=True) + + args = SimpleNamespace(local_rank=0) + engine, _, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config=config) + + for _ in range(5): + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + assert len(net._external_params) == 1 + assert len(net.dangler._external_params) == 0 + engine.backward(loss) + engine.step() + + @pytest.mark.parametrize('output_type', [torch.tensor, dict, None]) + def test_stage_3_output_type(self, output_type): + setup_serial_env() + print() + + net = ModelContainerVariableOutputType(output_type=output_type) + + args = SimpleNamespace(local_rank=0) + engine, _, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config=config) + + for _ in range(1): + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + if 
loss is not None: + if isinstance(loss, dict): + loss = loss['loss'] + engine.backward(loss) + engine.step() diff --git a/tests/unit/runtime/zero/test_zero_tensor_fragment.py b/tests/unit/runtime/zero/test_zero_tensor_fragment.py new file mode 100644 index 0000000000000000000000000000000000000000..20caf05dd9d5b1d623af38fa8619d6068b502d47 --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_tensor_fragment.py @@ -0,0 +1,156 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import pytest +import deepspeed.comm as dist +import torch + +from unit.common import DistributedTest +from unit.simple_model import random_dataloader +from unit.util import bf16_required_version_check + +import deepspeed +from deepspeed.utils import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.ops.aio import AsyncIOBuilder + + +def validate_full_tensors(model): + for _, lp in model.named_parameters(): + hp = safe_get_full_fp32_param(lp) + exp_avg = safe_get_full_optimizer_state(lp, 'exp_avg') + exp_avg_sq = safe_get_full_optimizer_state(lp, 'exp_avg_sq') + hp_grad = safe_get_full_grad(lp) + param_list = [hp, hp_grad, exp_avg, exp_avg_sq] + if lp.requires_grad: + assert all([p is not None for p in param_list]) + else: + assert all([p is None for p in param_list]) + + +class MyModel(torch.nn.Module): + def __init__(self, hidden_dim, frozen_weights): + super(MyModel, self).__init__() + self.act = torch.nn.ReLU() + self.cel = torch.nn.CrossEntropyLoss() + self.linears = torch.nn.ModuleList([ + torch.nn.Linear(hidden_dim, + 1), + torch.nn.Linear(1, + 1), + torch.nn.Linear(1, + hidden_dim) + ]) + if frozen_weights: + self.linears[0].weight.requires_grad = False + self.linears[0].bias.requires_grad = False + + def forward(self, x, y): + for l in self.linears: + x = l(x) + x = self.act(x) + loss = self.cel(x, y) + val = (x, loss) + return val + + +def run_fragmented_model(model, 
config_dict, hidden_dim, dtype): + model, _, _, _ = deepspeed.initialize(model=model, + model_parameters=model.parameters(), + config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + dist.barrier() + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + loss = loss[1] + model.backward(loss) + validate_full_tensors(model) + model.step() + + +@pytest.mark.parametrize('frozen_weights', [True, False]) +class TestTensorFragment(DistributedTest): + # Need multiple gpus to test possible hanging + world_size = 2 + + @pytest.mark.parametrize('zero_stage', [1, 2, 3]) + @pytest.mark.parametrize( + 'offload_device', + [OffloadDeviceEnum.none, + OffloadDeviceEnum.cpu, + OffloadDeviceEnum.nvme]) + def test_zero_fragments(self, tmpdir, zero_stage, offload_device, frozen_weights): + if offload_device == OffloadDeviceEnum.nvme: + if zero_stage != 3: + pytest.skip(f"Nvme offload not supported for zero stage {zero_stage}") + if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: + pytest.skip('Skip tests since async-io is not compatible') + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 2 + }, + "zero_optimization": { + "stage": zero_stage, + } + } + + if offload_device == OffloadDeviceEnum.cpu: + config_dict["zero_optimization"]["offload_optimizer"] = { + "device": offload_device + } + elif offload_device == OffloadDeviceEnum.nvme: + config_dict["zero_optimization"]["offload_optimizer"] = { + "device": offload_device, + "nvme_path": str(tmpdir) + } + + hidden_dim = 128 + if zero_stage == 3: + with deepspeed.zero.Init(config_dict_or_path=config_dict): + model = MyModel(hidden_dim, frozen_weights) + else: + model = MyModel(hidden_dim, frozen_weights) + + run_fragmented_model(model, config_dict, 
hidden_dim, torch.float16) + + def test_bf16_fragments(self, frozen_weights): + if frozen_weights: + pytest.skip("TODO: Frozen weights not currently supported by BF16 Optimizer") + + if not bf16_required_version_check(accelerator_check=False): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "bf16": { + "enabled": True + }, + "zero_optimization": { + "stage": 0, + } + } + + hidden_dim = 128 + model = MyModel(hidden_dim, frozen_weights) + run_fragmented_model(model, config_dict, hidden_dim, torch.bfloat16) diff --git a/tests/unit/runtime/zero/test_zero_tiled.py b/tests/unit/runtime/zero/test_zero_tiled.py new file mode 100644 index 0000000000000000000000000000000000000000..5858b59368721d1ca840fcf711b6dcc8e3ec40e6 --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_tiled.py @@ -0,0 +1,173 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import copy + +import torch +from deepspeed.runtime.zero.tiling import TiledLinear, TiledLinearReturnBias + +import pytest + + +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2), (5, 5), (32, 32)]) +def test_tiled_init(in_splits, out_splits): + in_f = 32 + out_f = 40 + base = torch.nn.Linear(in_f, out_f, bias=True) + l = TiledLinear(in_f, + out_f, + bias=True, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + for out_id in range(out_splits): + for in_id in range(in_splits): + local_l = l.linears[out_id][in_id] + assert isinstance(local_l, torch.nn.Linear) + + rstart = l.out_parts[out_id] + rstop = l.out_parts[out_id + 1] + cstart = l.in_parts[in_id] + cstop = l.in_parts[in_id + 1] + + local_out = rstop - rstart + local_in = cstop - cstart + assert local_l.weight.size()[1] == local_in, f'local[{out_id}][{in_id}].size 
{local_l.weight.size()}' + assert local_l.weight.size()[0] == local_out + + test = base.weight[rstart:rstop, cstart:cstop] + + assert local_l.weight.size() == test.size() + assert torch.equal(local_l.weight.data, test.data) + + if in_id == in_splits - 1: + assert local_l.bias is not None + assert local_l.bias.size()[0] == local_out + else: + assert local_l.bias is None + + +@pytest.mark.parametrize('in_splits,out_splits', [(0, 0), (33, 33)]) +def test_tiled_baddim(in_splits, out_splits): + dim = 32 + with pytest.raises(RuntimeError): + l = TiledLinear(dim, dim, out_splits=out_splits, in_splits=in_splits) + + +@pytest.mark.skip(reason="seeing nondeterministic failures, skipping for now") +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_forward(in_splits, out_splits, bias, in_f, out_f): + base = torch.nn.Linear(in_f, out_f, bias=bias) + test = TiledLinear(in_f, + out_f, + bias=bias, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out = base(copy.deepcopy(inp)) + test_out = test(copy.deepcopy(inp)) + + assert torch.allclose(base_out, test_out, rtol=1e-4) + + +@pytest.mark.skip(reason="seeing nondeterministic failures, skipping for now") +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_backward(in_splits, out_splits, bias, in_f, out_f): + base = torch.nn.Linear(in_f, out_f, bias=bias) + test = TiledLinear(in_f, + out_f, + bias=bias, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out = base(copy.deepcopy(inp)) + test_out = test(copy.deepcopy(inp)) + assert torch.allclose(base_out, test_out, rtol=1e-4) + + 
base_out.sum().backward() + test_out.sum().backward() + + # compare grads + for row in range(out_splits): + rstart = test.out_parts[row] + rstop = test.out_parts[row + 1] + + for col in range(in_splits): + cstart = test.in_parts[col] + cstop = test.in_parts[col + 1] + + local = test.linears[row][col] + base_grad = base.weight.grad[rstart:rstop, cstart:cstop] + assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) + + if local.bias is not None: + base_grad = base.bias.grad[rstart:rstop] + assert torch.allclose(base_grad, local.bias.grad, rtol=1e-4) + + +class LinearWrapper(torch.nn.Linear): + """Returns its own bias to simulate Megatron-LM's behavior. + + Megatron-LM optionally delays the bias addition to fuse with a proceeding kernel. + """ + def forward(self, input): + out = super().forward(input) + return out, self.bias + + +@pytest.mark.skip(reason="seeing nondeterministic failures, skipping for now") +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_returnbias_backward(in_splits, out_splits, bias, in_f, out_f): + base = LinearWrapper(in_f, out_f, bias=bias) + test = TiledLinearReturnBias(in_f, + out_f, + bias=bias, + linear_cls=LinearWrapper, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out_t, base_out_b = base(copy.deepcopy(inp)) + test_out_t, test_out_b = test(copy.deepcopy(inp)) + assert torch.allclose(base_out_t, test_out_t, rtol=1e-4) + if base_out_b is None: + assert test_out_b is None + base_out_b = torch.zeros_like(base_out_t) + test_out_b = torch.zeros_like(test_out_t) + else: + assert test_out_b is not None + assert torch.allclose(base_out_b, test_out_b, rtol=1e-4) + + (base_out_t + base_out_b).sum().backward() + (test_out_t + test_out_b).sum().backward() + + # compare grads + for row in range(out_splits): + rstart = 
test.out_parts[row] + rstop = test.out_parts[row + 1] + + for col in range(in_splits): + cstart = test.in_parts[col] + cstop = test.in_parts[col + 1] + + local = test.linears[row][col] + base_grad = base.weight.grad[rstart:rstop, cstart:cstop] + assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) + + if local.bias is not None: + base_grad = base.bias.grad[rstart:rstop] + assert torch.allclose(base_grad, local.bias.grad, rtol=1e-4) diff --git a/tests/unit/runtime/zero/utils.py b/tests/unit/runtime/zero/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5f0687892d43ac6a78cb51d36ea6c7ad523cad39 --- /dev/null +++ b/tests/unit/runtime/zero/utils.py @@ -0,0 +1,13 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +from unit.common import get_master_port + + +def setup_serial_env(): + # Setup for a serial run + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = get_master_port() + os.environ['LOCAL_RANK'] = '0' + os.environ['RANK'] = '0' + os.environ['WORLD_SIZE'] = '1' diff --git a/tests/unit/simple_model.py b/tests/unit/simple_model.py index aaa8ffc67971516ff99422f43baa9e6b25207208..481aae0bfdcd69a62aa5b73f5f8922338cb7546b 100644 --- a/tests/unit/simple_model.py +++ b/tests/unit/simple_model.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import os import json import argparse @@ -5,6 +7,9 @@ import torch from deepspeed.pipe import PipelineModule, LayerSpec from deepspeed.moe.layer import MoE +from deepspeed.accelerator import get_accelerator + +import deepspeed.comm as dist class SimpleModel(torch.nn.Module): @@ -41,9 +46,16 @@ class SimpleMoEModel(torch.nn.Module): def __init__(self, hidden_dim, num_experts=4, ep_size=1, use_residual=False): super(SimpleMoEModel, self).__init__() self.linear = torch.nn.Linear(hidden_dim, hidden_dim) - linear2 = torch.nn.Linear(hidden_dim, hidden_dim) + expert = torch.nn.Linear(hidden_dim, hidden_dim) + # using two MoE layers to check implications of 
sharing a single storage self.linear2 = MoE(hidden_size=hidden_dim, - expert=linear2, + expert=expert, + ep_size=ep_size, + use_residual=use_residual, + num_experts=num_experts, + k=1) + self.linear3 = MoE(hidden_size=hidden_dim, + expert=expert, ep_size=ep_size, use_residual=use_residual, num_experts=num_experts, @@ -51,9 +63,9 @@ class SimpleMoEModel(torch.nn.Module): self.cross_entropy_loss = torch.nn.CrossEntropyLoss() def forward(self, x, y): - hidden_dim = x - hidden_dim = self.linear(hidden_dim) + hidden_dim = self.linear(x) output, _, _ = self.linear2(hidden_dim) + output, _, _ = self.linear3(output) hidden_dim = hidden_dim + output sentence_embed = hidden_dim.mean(1) return self.cross_entropy_loss(sentence_embed, y) @@ -261,10 +273,10 @@ def create_deepspeed_args(): parser = argparse.ArgumentParser() args = parser.parse_args(args='') args.deepspeed = True - if torch.distributed.is_initialized(): + if dist.is_initialized(): # We assume up to one full node executing unit tests - assert torch.distributed.get_world_size() <= torch.cuda.device_count() - args.local_rank = torch.distributed.get_rank() + assert dist.get_world_size() <= get_accelerator().device_count() + args.local_rank = dist.get_rank() return args diff --git a/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370169.9dad78d721ca.29247.0 b/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370169.9dad78d721ca.29247.0 new file mode 100644 index 0000000000000000000000000000000000000000..25f4ac0ae26c93aace54844e1b8adc2365635d02 Binary files /dev/null and b/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370169.9dad78d721ca.29247.0 differ diff --git a/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370366.9dad78d721ca.39331.0 b/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370366.9dad78d721ca.39331.0 new file mode 100644 index 0000000000000000000000000000000000000000..365f547ef19ed862babf4222d3fb52450ac14c61 Binary files /dev/null and 
b/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370366.9dad78d721ca.39331.0 differ diff --git a/tests/unit/util.py b/tests/unit/util.py index 966733b1d9290dc9fef80fc0fb1a25e101e127c9..2face75846d7b32df09af1d0cf15abd79d33a787 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -1,3 +1,5 @@ +'''Copyright The Microsoft DeepSpeed Team''' + import torch from deepspeed.git_version_info import torch_info @@ -12,21 +14,50 @@ def required_torch_version(): return False -def bf16_required_version_check(): - TORCH_MAJOR = int(torch.__version__.split('.')[0]) - TORCH_MINOR = int(torch.__version__.split('.')[1]) +def bf16_required_version_check(accelerator_check=True): + split_version = lambda x: map(int, x.split('.')[:2]) + TORCH_MAJOR, TORCH_MINOR = split_version(torch_info['version']) + NCCL_MAJOR, NCCL_MINOR = split_version(torch_info['nccl_version']) + CUDA_MAJOR, CUDA_MINOR = split_version(torch_info['cuda_version']) - if type(torch.cuda.nccl.version()) != tuple: - return False + # Sometimes bf16 tests are runnable even if not natively supported by accelerator + if accelerator_check: + accelerator_pass = torch_info['bf16_support'] else: - NCCL_MAJOR = torch.cuda.nccl.version()[0] - NCCL_MINOR = torch.cuda.nccl.version()[1] + accelerator_pass = True - CUDA_MAJOR = int(torch_info['cuda_version'].split('.')[0]) if (TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)) and (CUDA_MAJOR >= 11) and ( NCCL_MAJOR > 2 or - (NCCL_MAJOR == 2 and NCCL_MINOR >= 10)) and torch.cuda.is_bf16_supported(): + (NCCL_MAJOR == 2 and NCCL_MINOR >= 10)) and accelerator_pass: return True else: return False + + +def required_minimum_torch_version(major_version, minor_version): + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + + if TORCH_MAJOR < major_version: + return False + + return TORCH_MAJOR > major_version or TORCH_MINOR >= minor_version + + +def required_maximum_torch_version(major_version, minor_version): 
+ TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + + if TORCH_MAJOR > major_version: + return False + + return TORCH_MAJOR < major_version or TORCH_MINOR <= minor_version + + +def required_amp_check(): + from importlib.util import find_spec + if find_spec('apex') is None: + return False + else: + return True diff --git a/tests/unit/utils/test_get_optim_files.py b/tests/unit/utils/test_get_optim_files.py new file mode 100644 index 0000000000000000000000000000000000000000..ccbd9626d6e85ebbb6e5507f010531d8ec68d556 --- /dev/null +++ b/tests/unit/utils/test_get_optim_files.py @@ -0,0 +1,19 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import os +import pytest +from deepspeed.utils.zero_to_fp32 import get_optim_files + + +@pytest.mark.parametrize('num_checkpoints', [1, 2, 12, 24]) +def test_get_optim_files(tmpdir, num_checkpoints): + saved_files = [] + for i in range(num_checkpoints): + file_name = "zero_" + str(i) + "_optim_states.pt" + path_name = os.path.join(tmpdir, file_name) + saved_files.append(path_name) + with open(path_name, "w") as f: + f.write(file_name) + loaded_files = get_optim_files(tmpdir) + for lf, sf in zip(loaded_files, saved_files): + assert lf == sf diff --git a/tests/unit/utils/test_groups.py b/tests/unit/utils/test_groups.py new file mode 100644 index 0000000000000000000000000000000000000000..06b391e2e3014997f0e8d5442849f9f5a7aabd34 --- /dev/null +++ b/tests/unit/utils/test_groups.py @@ -0,0 +1,55 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +from deepspeed.utils.groups import _get_expert_parallel_ranks + + +def test_get_expert_parallel_ranks(): + """ + Example - E + M + D parallel + world_size = 16 + model_degree = 2 + expert_degree = 4 # number of experts in same group + mp_group = [0, 1], [2,3], [4,5] ... 
+ data_parallel_group =[0,2,4,6,8,10, 12,14], [1,3,5,7,9,11,13,15] + expert_parallel_group = [0,2,4,6], [8,10,12,14] [1,3,5,7], [9,11,13,15] + expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15] + """ + expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks( + world_size=16, model_parallel_size_=2, expert_parallel_size_=4 + ) + assert expert_parallel_groups == [ + [0, + 2, + 4, + 6], + [8, + 10, + 12, + 14], + [1, + 3, + 5, + 7], + [9, + 11, + 13, + 15], + ] + assert expert_data_parallel_groups == [ + [0, + 8], + [2, + 10], + [4, + 12], + [6, + 14], + [1, + 9], + [3, + 11], + [5, + 13], + [7, + 15], + ] diff --git a/tests/unit/utils/test_init_on_device.py b/tests/unit/utils/test_init_on_device.py new file mode 100644 index 0000000000000000000000000000000000000000..25d102fd05a7a281a58bebac448d785cd87e0541 --- /dev/null +++ b/tests/unit/utils/test_init_on_device.py @@ -0,0 +1,26 @@ +'''Copyright The Microsoft DeepSpeed Team''' + +import torch +import pytest +from unit.simple_model import SimpleModel +from deepspeed import OnDevice +from packaging import version as pkg_version +from deepspeed.accelerator import get_accelerator +from unit.common import DistributedTest + + +@pytest.mark.parametrize('device', ['meta', get_accelerator().device_name(0)]) +class TestOnDevice(DistributedTest): + world_size = 1 + + def test_on_device(self, device): + if device == "meta" and pkg_version.parse( + torch.__version__) < pkg_version.parse("1.10"): + pytest.skip("meta tensors only became stable after torch 1.10") + + with OnDevice(dtype=torch.half, device=device): + model = SimpleModel(4) + + for p in model.parameters(): + assert p.device == torch.device(device) + assert p.dtype == torch.half diff --git a/version.txt b/version.txt index 844f6a91acb92e5f4c58fe0d440fba8deea2a8c8..100435be135a32ae8974fe4dd281c4d3a9d62e02 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.6.3 +0.8.2